1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 34e2b4712SSatish Balay /* 44e2b4712SSatish Balay Factorization code for BAIJ format. 54e2b4712SSatish Balay */ 64e2b4712SSatish Balay 77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 8*c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 94e2b4712SSatish Balay 104a2ae208SSatish Balay #undef __FUNCT__ 114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 12dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 13f1af5d2fSBarry Smith { 14f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15dfbe8321SBarry Smith PetscErrorCode ierr; 16690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17690b6cddSBarry Smith PetscInt *diag = a->diag; 18f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 1987828ca2SBarry Smith PetscScalar s1,*x,*b; 20f1af5d2fSBarry Smith 21f1af5d2fSBarry Smith PetscFunctionBegin; 22ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 231ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25f1af5d2fSBarry Smith 26f1af5d2fSBarry Smith /* forward solve the U^T */ 27f1af5d2fSBarry Smith for (i=0; i<n; i++) { 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith v = aa + diag[i]; 30f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 31ef66eb69SBarry Smith s1 = (*v++)*x[i]; 32f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 33f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 34f1af5d2fSBarry Smith while (nz--) { 35f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 36f1af5d2fSBarry Smith } 37f1af5d2fSBarry Smith x[i] = s1; 38f1af5d2fSBarry Smith } 39f1af5d2fSBarry Smith /* backward solve the L^T */ 40f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 41f1af5d2fSBarry Smith v = aa + diag[i] - 1; 42f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 43f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 44f1af5d2fSBarry Smith s1 = x[i]; 45f1af5d2fSBarry Smith while (nz--) { 46f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 47f1af5d2fSBarry Smith } 48f1af5d2fSBarry Smith } 491ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 52f1af5d2fSBarry Smith PetscFunctionReturn(0); 53f1af5d2fSBarry Smith } 54f1af5d2fSBarry Smith 554a2ae208SSatish Balay #undef __FUNCT__ 564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58f1af5d2fSBarry Smith { 59f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60dfbe8321SBarry Smith PetscErrorCode ierr; 61690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 62690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 63f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6487828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6587828ca2SBarry Smith PetscScalar *x,*b; 66f1af5d2fSBarry Smith 67f1af5d2fSBarry Smith PetscFunctionBegin; 68ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 691ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 71f1af5d2fSBarry Smith 72f1af5d2fSBarry Smith /* forward solve the U^T */ 73f1af5d2fSBarry Smith idx = 0; 74f1af5d2fSBarry Smith for (i=0; i<n; i++) { 75f1af5d2fSBarry Smith 76f1af5d2fSBarry Smith v = aa + 4*diag[i]; 77f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 78ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 79f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 80f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 81f1af5d2fSBarry Smith v += 4; 82f1af5d2fSBarry Smith 83f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 84f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 85f1af5d2fSBarry Smith while (nz--) { 86f1af5d2fSBarry Smith oidx = 2*(*vi++); 87f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 88f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 89f1af5d2fSBarry Smith v += 4; 90f1af5d2fSBarry Smith } 91f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 92f1af5d2fSBarry Smith idx += 2; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith /* backward solve the L^T */ 95f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 96f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 97f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 98f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 99f1af5d2fSBarry Smith idt = 2*i; 100f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 101f1af5d2fSBarry Smith while (nz--) { 102f1af5d2fSBarry Smith idx = 2*(*vi--); 103f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 104f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 105f1af5d2fSBarry Smith v -= 4; 106f1af5d2fSBarry Smith } 107f1af5d2fSBarry Smith } 1081ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1091ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 110dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 111f1af5d2fSBarry Smith PetscFunctionReturn(0); 112f1af5d2fSBarry Smith } 113f1af5d2fSBarry Smith 1144a2ae208SSatish Balay #undef __FUNCT__ 1154a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 116dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 117f1af5d2fSBarry Smith { 118f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119dfbe8321SBarry Smith PetscErrorCode ierr; 120690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 121690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 122f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12387828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 12487828ca2SBarry Smith PetscScalar *x,*b; 125f1af5d2fSBarry Smith 126f1af5d2fSBarry Smith PetscFunctionBegin; 127ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1281ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 130f1af5d2fSBarry Smith 131f1af5d2fSBarry Smith /* forward solve the U^T */ 132f1af5d2fSBarry Smith idx = 0; 133f1af5d2fSBarry Smith for (i=0; i<n; i++) { 134f1af5d2fSBarry Smith 135f1af5d2fSBarry Smith v = aa + 9*diag[i]; 136f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 137ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 138f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 139f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 140f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 141f1af5d2fSBarry Smith v += 9; 142f1af5d2fSBarry Smith 143f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 144f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 145f1af5d2fSBarry Smith while (nz--) { 146f1af5d2fSBarry Smith oidx = 3*(*vi++); 147f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 148f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 149f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 150f1af5d2fSBarry Smith v += 9; 151f1af5d2fSBarry Smith } 152f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 153f1af5d2fSBarry Smith idx += 3; 154f1af5d2fSBarry Smith } 155f1af5d2fSBarry Smith /* backward solve the L^T */ 156f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 157f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 158f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 159f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 160f1af5d2fSBarry Smith idt = 3*i; 161f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 162f1af5d2fSBarry Smith while (nz--) { 163f1af5d2fSBarry Smith idx = 3*(*vi--); 164f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 165f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 166f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 167f1af5d2fSBarry Smith v -= 9; 168f1af5d2fSBarry Smith } 169f1af5d2fSBarry Smith } 1701ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1711ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 173f1af5d2fSBarry Smith PetscFunctionReturn(0); 174f1af5d2fSBarry Smith } 175f1af5d2fSBarry Smith 1764a2ae208SSatish Balay #undef __FUNCT__ 1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 184f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 18687828ca2SBarry Smith PetscScalar *x,*b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 16*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 201f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 202f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 203f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 204f1af5d2fSBarry Smith v += 16; 205f1af5d2fSBarry Smith 206f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 207f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 208f1af5d2fSBarry Smith while (nz--) { 209f1af5d2fSBarry Smith oidx = 4*(*vi++); 210f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 211f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 212f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 213f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 214f1af5d2fSBarry Smith v += 16; 215f1af5d2fSBarry Smith } 216f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 217f1af5d2fSBarry Smith idx += 4; 218f1af5d2fSBarry Smith } 219f1af5d2fSBarry Smith /* backward solve the L^T */ 220f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 221f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 222f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 223f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 224f1af5d2fSBarry Smith idt = 4*i; 225f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 226f1af5d2fSBarry Smith while (nz--) { 227f1af5d2fSBarry Smith idx = 4*(*vi--); 228f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 229f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 230f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 231f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 232f1af5d2fSBarry Smith v -= 16; 233f1af5d2fSBarry Smith } 234f1af5d2fSBarry Smith } 2351ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 237dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 238f1af5d2fSBarry Smith PetscFunctionReturn(0); 239f1af5d2fSBarry Smith } 240f1af5d2fSBarry Smith 2414a2ae208SSatish Balay #undef __FUNCT__ 2424a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 243dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 244f1af5d2fSBarry Smith { 245f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 246dfbe8321SBarry Smith PetscErrorCode ierr; 247690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 248690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 249f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 25087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 25187828ca2SBarry Smith PetscScalar *x,*b; 252f1af5d2fSBarry Smith 253f1af5d2fSBarry Smith PetscFunctionBegin; 254ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2551ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 257f1af5d2fSBarry Smith 258f1af5d2fSBarry Smith /* forward solve the U^T */ 259f1af5d2fSBarry Smith idx = 0; 260f1af5d2fSBarry Smith for (i=0; i<n; i++) { 261f1af5d2fSBarry Smith 262f1af5d2fSBarry Smith v = aa + 25*diag[i]; 263f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 264ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 265f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 266f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 267f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 268f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 269f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 270f1af5d2fSBarry Smith v += 25; 271f1af5d2fSBarry Smith 272f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 273f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 274f1af5d2fSBarry Smith while (nz--) { 275f1af5d2fSBarry Smith oidx = 5*(*vi++); 276f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 277f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 278f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 279f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 280f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 281f1af5d2fSBarry Smith v += 25; 282f1af5d2fSBarry Smith } 283f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 284f1af5d2fSBarry Smith idx += 5; 285f1af5d2fSBarry Smith } 286f1af5d2fSBarry Smith /* backward solve the L^T */ 287f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 288f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 289f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 290f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 291f1af5d2fSBarry Smith idt = 5*i; 292f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 293f1af5d2fSBarry Smith while (nz--) { 294f1af5d2fSBarry Smith idx = 5*(*vi--); 295f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 296f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 297f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 298f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 299f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 300f1af5d2fSBarry Smith v -= 25; 301f1af5d2fSBarry Smith } 302f1af5d2fSBarry Smith } 3031ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 305dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 306f1af5d2fSBarry Smith PetscFunctionReturn(0); 307f1af5d2fSBarry Smith } 308f1af5d2fSBarry Smith 3094a2ae208SSatish Balay #undef __FUNCT__ 3104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 311dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 312f1af5d2fSBarry Smith { 313f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 314dfbe8321SBarry Smith PetscErrorCode ierr; 315690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 316690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 317f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 31887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 31987828ca2SBarry Smith PetscScalar *x,*b; 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith PetscFunctionBegin; 322ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3231ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 325f1af5d2fSBarry Smith 326f1af5d2fSBarry Smith /* forward solve the U^T */ 327f1af5d2fSBarry Smith idx = 0; 328f1af5d2fSBarry Smith for (i=0; i<n; i++) { 329f1af5d2fSBarry Smith 330f1af5d2fSBarry Smith v = aa + 36*diag[i]; 331f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 332ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 333ef66eb69SBarry Smith x6 = x[5+idx]; 334f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 335f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 336f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 337f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 338f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 339f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 340f1af5d2fSBarry Smith v += 36; 341f1af5d2fSBarry Smith 342f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 343f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 344f1af5d2fSBarry Smith while (nz--) { 345f1af5d2fSBarry Smith oidx = 6*(*vi++); 346f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 347f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 348f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 349f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 350f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 351f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 352f1af5d2fSBarry Smith v += 36; 353f1af5d2fSBarry Smith } 354f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 355f1af5d2fSBarry Smith x[5+idx] = s6; 356f1af5d2fSBarry Smith idx += 6; 357f1af5d2fSBarry Smith } 358f1af5d2fSBarry Smith /* backward solve the L^T */ 359f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 360f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 361f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 362f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 363f1af5d2fSBarry Smith idt = 6*i; 364f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 365f1af5d2fSBarry Smith s6 = x[5+idt]; 366f1af5d2fSBarry Smith while (nz--) { 367f1af5d2fSBarry Smith idx = 6*(*vi--); 368f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 369f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 370f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 371f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 372f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 373f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 374f1af5d2fSBarry Smith v -= 36; 375f1af5d2fSBarry Smith } 376f1af5d2fSBarry Smith } 3771ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 379dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 380f1af5d2fSBarry Smith PetscFunctionReturn(0); 381f1af5d2fSBarry Smith } 382f1af5d2fSBarry Smith 3834a2ae208SSatish Balay #undef __FUNCT__ 3844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 385dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 386f1af5d2fSBarry Smith { 387f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 388dfbe8321SBarry Smith PetscErrorCode ierr; 389690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 390690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 391f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 39287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 39387828ca2SBarry Smith PetscScalar *x,*b; 394f1af5d2fSBarry Smith 395f1af5d2fSBarry Smith PetscFunctionBegin; 396ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3971ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 399f1af5d2fSBarry Smith 400f1af5d2fSBarry Smith /* forward solve the U^T */ 401f1af5d2fSBarry Smith idx = 0; 402f1af5d2fSBarry Smith for (i=0; i<n; i++) { 403f1af5d2fSBarry Smith 404f1af5d2fSBarry Smith v = aa + 49*diag[i]; 405f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 406ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 407ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 408f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 409f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 410f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 411f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 412f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 413f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 414f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 415f1af5d2fSBarry Smith v += 49; 416f1af5d2fSBarry Smith 417f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 418f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 419f1af5d2fSBarry Smith while (nz--) { 420f1af5d2fSBarry Smith oidx = 7*(*vi++); 421f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 422f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 423f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 424f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 425f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 426f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 427f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 428f1af5d2fSBarry Smith v += 49; 429f1af5d2fSBarry Smith } 430f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 431f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 432f1af5d2fSBarry Smith idx += 7; 433f1af5d2fSBarry Smith } 434f1af5d2fSBarry Smith /* backward solve the L^T */ 435f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 436f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 437f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 438f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 439f1af5d2fSBarry Smith idt = 7*i; 440f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 441f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 442f1af5d2fSBarry Smith while (nz--) { 443f1af5d2fSBarry Smith idx = 7*(*vi--); 444f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 445f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 446f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 447f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 448f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 449f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 450f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 451f1af5d2fSBarry Smith v -= 49; 452f1af5d2fSBarry Smith } 453f1af5d2fSBarry Smith } 4541ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 456dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 457f1af5d2fSBarry Smith PetscFunctionReturn(0); 458f1af5d2fSBarry Smith } 459f1af5d2fSBarry Smith 460f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 4614a2ae208SSatish Balay #undef __FUNCT__ 4624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 463dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 464f1af5d2fSBarry Smith { 465f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 466f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 4676849ba73SBarry Smith PetscErrorCode ierr; 4685d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 4695d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 470690b6cddSBarry Smith PetscInt *diag = a->diag; 471f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 47287828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 473f1af5d2fSBarry Smith 474f1af5d2fSBarry Smith PetscFunctionBegin; 4751ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 477f1af5d2fSBarry Smith t = a->solve_work; 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 480f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 481f1af5d2fSBarry Smith 482f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 483f1af5d2fSBarry Smith for (i=0; i<n; i++) { 484f1af5d2fSBarry Smith t[i] = b[c[i]]; 485f1af5d2fSBarry Smith } 486f1af5d2fSBarry Smith 487f1af5d2fSBarry Smith /* forward solve the U^T */ 488f1af5d2fSBarry Smith for (i=0; i<n; i++) { 489f1af5d2fSBarry Smith 490f1af5d2fSBarry Smith v = aa + diag[i]; 491f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 492f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 493f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 494f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 495f1af5d2fSBarry Smith while (nz--) { 496f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 497f1af5d2fSBarry Smith } 498f1af5d2fSBarry Smith t[i] = s1; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith /* backward solve the L^T */ 501f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 502f1af5d2fSBarry Smith v = aa + diag[i] - 1; 503f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 504f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 505f1af5d2fSBarry Smith s1 = t[i]; 506f1af5d2fSBarry Smith while (nz--) { 507f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 508f1af5d2fSBarry Smith } 509f1af5d2fSBarry Smith } 510f1af5d2fSBarry Smith 511f1af5d2fSBarry Smith /* copy t into x according to permutation */ 512f1af5d2fSBarry Smith for (i=0; i<n; i++) { 513f1af5d2fSBarry Smith x[r[i]] = t[i]; 514f1af5d2fSBarry Smith } 515f1af5d2fSBarry Smith 516f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 517f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5181ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 520dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 521f1af5d2fSBarry Smith PetscFunctionReturn(0); 522f1af5d2fSBarry Smith } 523f1af5d2fSBarry Smith 5244a2ae208SSatish Balay #undef __FUNCT__ 5254a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 526dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 527f1af5d2fSBarry Smith { 528f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 529f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5306849ba73SBarry Smith PetscErrorCode ierr; 5315d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5325d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 533690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 534f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53587828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 53687828ca2SBarry Smith PetscScalar *x,*b,*t; 537f1af5d2fSBarry Smith 538f1af5d2fSBarry Smith PetscFunctionBegin; 5391ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 541f1af5d2fSBarry Smith t = a->solve_work; 542f1af5d2fSBarry Smith 543f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 544f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 547f1af5d2fSBarry Smith ii = 0; 548f1af5d2fSBarry Smith for (i=0; i<n; i++) { 549f1af5d2fSBarry Smith ic = 2*c[i]; 550f1af5d2fSBarry Smith t[ii] = b[ic]; 551f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 552f1af5d2fSBarry Smith ii += 2; 553f1af5d2fSBarry Smith } 554f1af5d2fSBarry Smith 555f1af5d2fSBarry Smith /* forward solve the U^T */ 556f1af5d2fSBarry Smith idx = 0; 557f1af5d2fSBarry Smith for (i=0; i<n; i++) { 558f1af5d2fSBarry Smith 559f1af5d2fSBarry Smith v = aa + 4*diag[i]; 560f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 561f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 562f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 563f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 564f1af5d2fSBarry Smith v += 4; 565f1af5d2fSBarry Smith 566f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 567f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 568f1af5d2fSBarry Smith while (nz--) { 569f1af5d2fSBarry Smith oidx = 2*(*vi++); 570f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 571f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 572f1af5d2fSBarry Smith v += 4; 573f1af5d2fSBarry Smith } 574f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 575f1af5d2fSBarry Smith idx += 2; 576f1af5d2fSBarry Smith } 577f1af5d2fSBarry Smith /* backward solve the L^T */ 578f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 579f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 580f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 581f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 582f1af5d2fSBarry Smith idt = 2*i; 583f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 584f1af5d2fSBarry Smith while (nz--) { 585f1af5d2fSBarry Smith idx = 2*(*vi--); 586f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 587f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 588f1af5d2fSBarry Smith v -= 4; 589f1af5d2fSBarry Smith } 590f1af5d2fSBarry Smith } 591f1af5d2fSBarry Smith 592f1af5d2fSBarry Smith /* copy t into x according to permutation */ 593f1af5d2fSBarry Smith ii = 0; 594f1af5d2fSBarry Smith for (i=0; i<n; i++) { 595f1af5d2fSBarry Smith ir = 2*r[i]; 596f1af5d2fSBarry Smith x[ir] = t[ii]; 597f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 598f1af5d2fSBarry Smith ii += 2; 599f1af5d2fSBarry Smith } 600f1af5d2fSBarry Smith 601f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 602f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6031ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 605dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 606f1af5d2fSBarry Smith PetscFunctionReturn(0); 607f1af5d2fSBarry Smith } 608f1af5d2fSBarry Smith 6094a2ae208SSatish Balay #undef __FUNCT__ 6104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 611dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 612f1af5d2fSBarry Smith { 613f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 614f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6156849ba73SBarry Smith PetscErrorCode ierr; 6165d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6175d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 618690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 619f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 62087828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 62187828ca2SBarry Smith PetscScalar *x,*b,*t; 622f1af5d2fSBarry Smith 623f1af5d2fSBarry Smith PetscFunctionBegin; 6241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 626f1af5d2fSBarry Smith t = a->solve_work; 627f1af5d2fSBarry Smith 628f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 629f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 632f1af5d2fSBarry Smith ii = 0; 633f1af5d2fSBarry Smith for (i=0; i<n; i++) { 634f1af5d2fSBarry Smith ic = 3*c[i]; 635f1af5d2fSBarry Smith t[ii] = b[ic]; 636f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 637f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 638f1af5d2fSBarry Smith ii += 3; 639f1af5d2fSBarry Smith } 640f1af5d2fSBarry Smith 641f1af5d2fSBarry Smith /* forward solve the U^T */ 642f1af5d2fSBarry Smith idx = 0; 643f1af5d2fSBarry Smith for (i=0; i<n; i++) { 644f1af5d2fSBarry Smith 645f1af5d2fSBarry Smith v = aa + 9*diag[i]; 646f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 647f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 648f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 649f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 650f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 651f1af5d2fSBarry Smith v += 9; 652f1af5d2fSBarry Smith 653f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 654f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 655f1af5d2fSBarry Smith while (nz--) { 656f1af5d2fSBarry Smith oidx = 3*(*vi++); 657f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 658f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 659f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 660f1af5d2fSBarry Smith v += 9; 661f1af5d2fSBarry Smith } 662f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 663f1af5d2fSBarry Smith idx += 3; 664f1af5d2fSBarry Smith } 665f1af5d2fSBarry Smith /* backward solve the L^T */ 666f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 667f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 668f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 669f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 670f1af5d2fSBarry Smith idt = 3*i; 671f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 672f1af5d2fSBarry Smith while (nz--) { 673f1af5d2fSBarry Smith idx = 3*(*vi--); 674f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 675f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 676f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 677f1af5d2fSBarry Smith v -= 9; 678f1af5d2fSBarry Smith } 679f1af5d2fSBarry Smith } 680f1af5d2fSBarry Smith 681f1af5d2fSBarry Smith /* copy t into x according to permutation */ 682f1af5d2fSBarry Smith ii = 0; 683f1af5d2fSBarry Smith for (i=0; i<n; i++) { 684f1af5d2fSBarry Smith ir = 3*r[i]; 685f1af5d2fSBarry Smith x[ir] = t[ii]; 686f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 687f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 688f1af5d2fSBarry Smith ii += 3; 689f1af5d2fSBarry Smith } 690f1af5d2fSBarry Smith 691f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 692f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6931ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 695dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 696f1af5d2fSBarry Smith PetscFunctionReturn(0); 697f1af5d2fSBarry Smith } 698f1af5d2fSBarry Smith 6994a2ae208SSatish Balay #undef __FUNCT__ 7004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 701dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 702f1af5d2fSBarry Smith { 703f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 704f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7056849ba73SBarry Smith PetscErrorCode ierr; 7065d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7075d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 708690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 709f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 71087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 71187828ca2SBarry Smith PetscScalar *x,*b,*t; 712f1af5d2fSBarry Smith 713f1af5d2fSBarry Smith PetscFunctionBegin; 7141ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 716f1af5d2fSBarry Smith t = a->solve_work; 717f1af5d2fSBarry Smith 718f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 719f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 720f1af5d2fSBarry Smith 721f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 722f1af5d2fSBarry Smith ii = 0; 723f1af5d2fSBarry Smith for (i=0; i<n; i++) { 724f1af5d2fSBarry Smith ic = 4*c[i]; 725f1af5d2fSBarry Smith t[ii] = b[ic]; 726f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 727f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 728f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 729f1af5d2fSBarry Smith ii += 4; 730f1af5d2fSBarry Smith } 731f1af5d2fSBarry Smith 732f1af5d2fSBarry Smith /* forward solve the U^T */ 733f1af5d2fSBarry Smith idx = 0; 734f1af5d2fSBarry Smith for (i=0; i<n; i++) { 735f1af5d2fSBarry Smith 736f1af5d2fSBarry Smith v = aa + 16*diag[i]; 737f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 738f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 739f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 740f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 741f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 742f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 743f1af5d2fSBarry Smith v += 16; 744f1af5d2fSBarry Smith 745f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 746f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 747f1af5d2fSBarry Smith while (nz--) { 748f1af5d2fSBarry Smith oidx = 4*(*vi++); 749f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 750f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 751f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 752f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 753f1af5d2fSBarry Smith v += 16; 754f1af5d2fSBarry Smith } 755f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 756f1af5d2fSBarry Smith idx += 4; 757f1af5d2fSBarry Smith } 758f1af5d2fSBarry Smith /* backward solve the L^T */ 759f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 760f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 761f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 762f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 763f1af5d2fSBarry Smith idt = 4*i; 764f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 765f1af5d2fSBarry Smith while (nz--) { 766f1af5d2fSBarry Smith idx = 4*(*vi--); 767f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 768f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 769f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 770f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 771f1af5d2fSBarry Smith v -= 16; 772f1af5d2fSBarry Smith } 773f1af5d2fSBarry Smith } 774f1af5d2fSBarry Smith 775f1af5d2fSBarry Smith /* copy t into x according to permutation */ 776f1af5d2fSBarry Smith ii = 0; 777f1af5d2fSBarry Smith for (i=0; i<n; i++) { 778f1af5d2fSBarry Smith ir = 4*r[i]; 779f1af5d2fSBarry Smith x[ir] = t[ii]; 780f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 781f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 782f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 783f1af5d2fSBarry Smith ii += 4; 784f1af5d2fSBarry Smith } 785f1af5d2fSBarry Smith 786f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 787f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7881ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 790dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 791f1af5d2fSBarry Smith PetscFunctionReturn(0); 792f1af5d2fSBarry Smith } 793f1af5d2fSBarry Smith 7944a2ae208SSatish Balay #undef __FUNCT__ 7954a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 796dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 797f1af5d2fSBarry Smith { 798f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 799f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8006849ba73SBarry Smith PetscErrorCode ierr; 8015d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8025d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 803690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 804f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 80587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 80687828ca2SBarry Smith PetscScalar *x,*b,*t; 807f1af5d2fSBarry Smith 808f1af5d2fSBarry Smith PetscFunctionBegin; 8091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 811f1af5d2fSBarry Smith t = a->solve_work; 812f1af5d2fSBarry Smith 813f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 814f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 815f1af5d2fSBarry Smith 816f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 817f1af5d2fSBarry Smith ii = 0; 818f1af5d2fSBarry Smith for (i=0; i<n; i++) { 819f1af5d2fSBarry Smith ic = 5*c[i]; 820f1af5d2fSBarry Smith t[ii] = b[ic]; 821f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 822f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 823f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 824f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 825f1af5d2fSBarry Smith ii += 5; 826f1af5d2fSBarry Smith } 827f1af5d2fSBarry Smith 828f1af5d2fSBarry Smith /* forward solve the U^T */ 829f1af5d2fSBarry Smith idx = 0; 830f1af5d2fSBarry Smith for (i=0; i<n; i++) { 831f1af5d2fSBarry Smith 832f1af5d2fSBarry Smith v = aa + 25*diag[i]; 833f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 834f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 835f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 836f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 837f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 838f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 839f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 840f1af5d2fSBarry Smith v += 25; 841f1af5d2fSBarry Smith 842f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 843f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 844f1af5d2fSBarry Smith while (nz--) { 845f1af5d2fSBarry Smith oidx = 5*(*vi++); 846f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 847f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 848f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 849f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 850f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 851f1af5d2fSBarry Smith v += 25; 852f1af5d2fSBarry Smith } 853f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 854f1af5d2fSBarry Smith idx += 5; 855f1af5d2fSBarry Smith } 856f1af5d2fSBarry Smith /* backward solve the L^T */ 857f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 858f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 859f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 860f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 861f1af5d2fSBarry Smith idt = 5*i; 862f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 863f1af5d2fSBarry Smith while (nz--) { 864f1af5d2fSBarry Smith idx = 5*(*vi--); 865f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 866f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 867f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 868f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 869f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 870f1af5d2fSBarry Smith v -= 25; 871f1af5d2fSBarry Smith } 872f1af5d2fSBarry Smith } 873f1af5d2fSBarry Smith 874f1af5d2fSBarry Smith /* copy t into x according to permutation */ 875f1af5d2fSBarry Smith ii = 0; 876f1af5d2fSBarry Smith for (i=0; i<n; i++) { 877f1af5d2fSBarry Smith ir = 5*r[i]; 878f1af5d2fSBarry Smith x[ir] = t[ii]; 879f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 880f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 881f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 882f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 883f1af5d2fSBarry Smith ii += 5; 884f1af5d2fSBarry Smith } 885f1af5d2fSBarry Smith 886f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 887f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8881ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 890dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 891f1af5d2fSBarry Smith PetscFunctionReturn(0); 892f1af5d2fSBarry Smith } 893f1af5d2fSBarry Smith 8944a2ae208SSatish Balay #undef __FUNCT__ 8954a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 896dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 897f1af5d2fSBarry Smith { 898f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 899f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9006849ba73SBarry Smith PetscErrorCode ierr; 9015d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9025d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 903690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 904f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 90587828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 90687828ca2SBarry Smith PetscScalar *x,*b,*t; 907f1af5d2fSBarry Smith 908f1af5d2fSBarry Smith PetscFunctionBegin; 9091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 911f1af5d2fSBarry Smith t = a->solve_work; 912f1af5d2fSBarry Smith 913f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 914f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 915f1af5d2fSBarry Smith 916f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 917f1af5d2fSBarry Smith ii = 0; 918f1af5d2fSBarry Smith for (i=0; i<n; i++) { 919f1af5d2fSBarry Smith ic = 6*c[i]; 920f1af5d2fSBarry Smith t[ii] = b[ic]; 921f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 922f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 923f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 924f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 925f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 926f1af5d2fSBarry Smith ii += 6; 927f1af5d2fSBarry Smith } 928f1af5d2fSBarry Smith 929f1af5d2fSBarry Smith /* forward solve the U^T */ 930f1af5d2fSBarry Smith idx = 0; 931f1af5d2fSBarry Smith for (i=0; i<n; i++) { 932f1af5d2fSBarry Smith 933f1af5d2fSBarry Smith v = aa + 36*diag[i]; 934f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 935f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 936f1af5d2fSBarry Smith x6 = t[5+idx]; 937f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 938f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 939f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 940f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 941f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 942f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 943f1af5d2fSBarry Smith v += 36; 944f1af5d2fSBarry Smith 945f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 946f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 947f1af5d2fSBarry Smith while (nz--) { 948f1af5d2fSBarry Smith oidx = 6*(*vi++); 949f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 950f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 951f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 952f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 953f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 954f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 955f1af5d2fSBarry Smith v += 36; 956f1af5d2fSBarry Smith } 957f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 958f1af5d2fSBarry Smith t[5+idx] = s6; 959f1af5d2fSBarry Smith idx += 6; 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith /* backward solve the L^T */ 962f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 963f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 964f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 965f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 966f1af5d2fSBarry Smith idt = 6*i; 967f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 968f1af5d2fSBarry Smith s6 = t[5+idt]; 969f1af5d2fSBarry Smith while (nz--) { 970f1af5d2fSBarry Smith idx = 6*(*vi--); 971f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 972f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 973f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 974f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 975f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 976f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 977f1af5d2fSBarry Smith v -= 36; 978f1af5d2fSBarry Smith } 979f1af5d2fSBarry Smith } 980f1af5d2fSBarry Smith 981f1af5d2fSBarry Smith /* copy t into x according to permutation */ 982f1af5d2fSBarry Smith ii = 0; 983f1af5d2fSBarry Smith for (i=0; i<n; i++) { 984f1af5d2fSBarry Smith ir = 6*r[i]; 985f1af5d2fSBarry Smith x[ir] = t[ii]; 986f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 987f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 988f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 989f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 990f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 991f1af5d2fSBarry Smith ii += 6; 992f1af5d2fSBarry Smith } 993f1af5d2fSBarry Smith 994f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 995f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9961ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 998dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 999f1af5d2fSBarry Smith PetscFunctionReturn(0); 1000f1af5d2fSBarry Smith } 1001f1af5d2fSBarry Smith 10024a2ae208SSatish Balay #undef __FUNCT__ 10034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1004dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1005f1af5d2fSBarry Smith { 1006f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1007f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10086849ba73SBarry Smith PetscErrorCode ierr; 10095d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10105d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1011690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1012f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 101387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 101487828ca2SBarry Smith PetscScalar *x,*b,*t; 1015f1af5d2fSBarry Smith 1016f1af5d2fSBarry Smith PetscFunctionBegin; 10171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1019f1af5d2fSBarry Smith t = a->solve_work; 1020f1af5d2fSBarry Smith 1021f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1022f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1023f1af5d2fSBarry Smith 1024f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1025f1af5d2fSBarry Smith ii = 0; 1026f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1027f1af5d2fSBarry Smith ic = 7*c[i]; 1028f1af5d2fSBarry Smith t[ii] = b[ic]; 1029f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1030f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1031f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1032f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1033f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1034f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1035f1af5d2fSBarry Smith ii += 7; 1036f1af5d2fSBarry Smith } 1037f1af5d2fSBarry Smith 1038f1af5d2fSBarry Smith /* forward solve the U^T */ 1039f1af5d2fSBarry Smith idx = 0; 1040f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1041f1af5d2fSBarry Smith 1042f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1043f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1044f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1045f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1046f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1047f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1048f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1049f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1050f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1051f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1052f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1053f1af5d2fSBarry Smith v += 49; 1054f1af5d2fSBarry Smith 1055f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1056f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1057f1af5d2fSBarry Smith while (nz--) { 1058f1af5d2fSBarry Smith oidx = 7*(*vi++); 1059f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1060f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1061f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1062f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1063f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1064f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1065f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1066f1af5d2fSBarry Smith v += 49; 1067f1af5d2fSBarry Smith } 1068f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1069f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1070f1af5d2fSBarry Smith idx += 7; 1071f1af5d2fSBarry Smith } 1072f1af5d2fSBarry Smith /* backward solve the L^T */ 1073f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1074f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1075f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1076f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1077f1af5d2fSBarry Smith idt = 7*i; 1078f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1079f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1080f1af5d2fSBarry Smith while (nz--) { 1081f1af5d2fSBarry Smith idx = 7*(*vi--); 1082f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1083f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1084f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1085f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1086f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1087f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1088f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1089f1af5d2fSBarry Smith v -= 49; 1090f1af5d2fSBarry Smith } 1091f1af5d2fSBarry Smith } 1092f1af5d2fSBarry Smith 1093f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1094f1af5d2fSBarry Smith ii = 0; 1095f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1096f1af5d2fSBarry Smith ir = 7*r[i]; 1097f1af5d2fSBarry Smith x[ir] = t[ii]; 1098f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1099f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1100f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1101f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1102f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1103f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1104f1af5d2fSBarry Smith ii += 7; 1105f1af5d2fSBarry Smith } 1106f1af5d2fSBarry Smith 1107f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1108f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11091ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1111dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1112f1af5d2fSBarry Smith PetscFunctionReturn(0); 1113f1af5d2fSBarry Smith } 1114f1af5d2fSBarry Smith 11154e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11164a2ae208SSatish Balay #undef __FUNCT__ 11174a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1118dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11194e2b4712SSatish Balay { 11204e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11214e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11226849ba73SBarry Smith PetscErrorCode ierr; 11235d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11245d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11255d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11263f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 112787828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11284e2b4712SSatish Balay 11294e2b4712SSatish Balay PetscFunctionBegin; 11301ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1132f1af5d2fSBarry Smith t = a->solve_work; 11334e2b4712SSatish Balay 11344e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11354e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11364e2b4712SSatish Balay 11374e2b4712SSatish Balay /* forward solve the lower triangular */ 113887828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11394e2b4712SSatish Balay for (i=1; i<n; i++) { 11404e2b4712SSatish Balay v = aa + bs2*ai[i]; 11414e2b4712SSatish Balay vi = aj + ai[i]; 11424e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1143f1af5d2fSBarry Smith s = t + bs*i; 114487828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11454e2b4712SSatish Balay while (nz--) { 1146f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 11474e2b4712SSatish Balay v += bs2; 11484e2b4712SSatish Balay } 11494e2b4712SSatish Balay } 11504e2b4712SSatish Balay /* backward solve the upper triangular */ 1151d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 11524e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 11534e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 11544e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 11554e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 115687828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11574e2b4712SSatish Balay while (nz--) { 1158f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 11594e2b4712SSatish Balay v += bs2; 11604e2b4712SSatish Balay } 1161f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 116287828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 11634e2b4712SSatish Balay } 11644e2b4712SSatish Balay 11654e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 11664e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11671ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1169dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 11704e2b4712SSatish Balay PetscFunctionReturn(0); 11714e2b4712SSatish Balay } 11724e2b4712SSatish Balay 11734a2ae208SSatish Balay #undef __FUNCT__ 11744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1175dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 11764e2b4712SSatish Balay { 11774e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11784e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11796849ba73SBarry Smith PetscErrorCode ierr; 11805d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 11815d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 11823f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 118487828ca2SBarry Smith PetscScalar *x,*b,*t; 11854e2b4712SSatish Balay 11864e2b4712SSatish Balay PetscFunctionBegin; 11871ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1189f1af5d2fSBarry Smith t = a->solve_work; 11904e2b4712SSatish Balay 11914e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11924e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11934e2b4712SSatish Balay 11944e2b4712SSatish Balay /* forward solve the lower triangular */ 11954e2b4712SSatish Balay idx = 7*(*r++); 1196f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1197f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1198f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 11994e2b4712SSatish Balay 12004e2b4712SSatish Balay for (i=1; i<n; i++) { 12014e2b4712SSatish Balay v = aa + 49*ai[i]; 12024e2b4712SSatish Balay vi = aj + ai[i]; 12034e2b4712SSatish Balay nz = diag[i] - ai[i]; 12044e2b4712SSatish Balay idx = 7*(*r++); 1205f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1206f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12074e2b4712SSatish Balay while (nz--) { 12084e2b4712SSatish Balay idx = 7*(*vi++); 1209f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1210f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1211f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1212f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1213f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1214f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1215f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1216f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1217f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1218f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12194e2b4712SSatish Balay v += 49; 12204e2b4712SSatish Balay } 12214e2b4712SSatish Balay idx = 7*i; 1222f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1223f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1224f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12254e2b4712SSatish Balay } 12264e2b4712SSatish Balay /* backward solve the upper triangular */ 12274e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12284e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12294e2b4712SSatish Balay vi = aj + diag[i] + 1; 12304e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12314e2b4712SSatish Balay idt = 7*i; 1232f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1233f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1234f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12354e2b4712SSatish Balay while (nz--) { 12364e2b4712SSatish Balay idx = 7*(*vi++); 1237f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1238f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1239f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1240f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1241f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1242f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1243f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1244f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1245f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1246f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12474e2b4712SSatish Balay v += 49; 12484e2b4712SSatish Balay } 12494e2b4712SSatish Balay idc = 7*(*c--); 12504e2b4712SSatish Balay v = aa + 49*diag[i]; 1251f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1252f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1253f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1254f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1255f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1256f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1257f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1258f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1259f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1260f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1261f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1262f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1263f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1264f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 12654e2b4712SSatish Balay } 12664e2b4712SSatish Balay 12674e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12684e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12691ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1271dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 12724e2b4712SSatish Balay PetscFunctionReturn(0); 12734e2b4712SSatish Balay } 12744e2b4712SSatish Balay 12754a2ae208SSatish Balay #undef __FUNCT__ 12764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 127815091d37SBarry Smith { 127915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1280690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1281dfbe8321SBarry Smith PetscErrorCode ierr; 1282690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1283d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1284d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1285d9fead3dSBarry Smith const PetscScalar *b; 128615091d37SBarry Smith 128715091d37SBarry Smith PetscFunctionBegin; 1288d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 12891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 129015091d37SBarry Smith /* forward solve the lower triangular */ 129115091d37SBarry Smith idx = 0; 129215091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 129315091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 129415091d37SBarry Smith x[6] = b[6+idx]; 129515091d37SBarry Smith for (i=1; i<n; i++) { 129615091d37SBarry Smith v = aa + 49*ai[i]; 129715091d37SBarry Smith vi = aj + ai[i]; 129815091d37SBarry Smith nz = diag[i] - ai[i]; 129915091d37SBarry Smith idx = 7*i; 1300f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1301f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1302f1af5d2fSBarry Smith s7 = b[6+idx]; 130315091d37SBarry Smith while (nz--) { 130415091d37SBarry Smith jdx = 7*(*vi++); 130515091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 130615091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 130715091d37SBarry Smith x7 = x[6+jdx]; 1308f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1309f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1310f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1311f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1312f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1313f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1314f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 131515091d37SBarry Smith v += 49; 131615091d37SBarry Smith } 1317f1af5d2fSBarry Smith x[idx] = s1; 1318f1af5d2fSBarry Smith x[1+idx] = s2; 1319f1af5d2fSBarry Smith x[2+idx] = s3; 1320f1af5d2fSBarry Smith x[3+idx] = s4; 1321f1af5d2fSBarry Smith x[4+idx] = s5; 1322f1af5d2fSBarry Smith x[5+idx] = s6; 1323f1af5d2fSBarry Smith x[6+idx] = s7; 132415091d37SBarry Smith } 132515091d37SBarry Smith /* backward solve the upper triangular */ 132615091d37SBarry Smith for (i=n-1; i>=0; i--){ 132715091d37SBarry Smith v = aa + 49*diag[i] + 49; 132815091d37SBarry Smith vi = aj + diag[i] + 1; 132915091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 133015091d37SBarry Smith idt = 7*i; 1331f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1332f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1333f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1334f1af5d2fSBarry Smith s7 = x[6+idt]; 133515091d37SBarry Smith while (nz--) { 133615091d37SBarry Smith idx = 7*(*vi++); 133715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 133815091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 133915091d37SBarry Smith x7 = x[6+idx]; 1340f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1341f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1342f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1343f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1344f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1345f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1346f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 134715091d37SBarry Smith v += 49; 134815091d37SBarry Smith } 134915091d37SBarry Smith v = aa + 49*diag[i]; 1350f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1351f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1352f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1353f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1354f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1355f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1356f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1357f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1358f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1359f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1360f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1361f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1362f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1363f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 136415091d37SBarry Smith } 136515091d37SBarry Smith 1366d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13671ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1368dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 136915091d37SBarry Smith PetscFunctionReturn(0); 137015091d37SBarry Smith } 137115091d37SBarry Smith 13724a2ae208SSatish Balay #undef __FUNCT__ 13734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1374dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 137515091d37SBarry Smith { 137615091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 137715091d37SBarry Smith IS iscol=a->col,isrow=a->row; 13786849ba73SBarry Smith PetscErrorCode ierr; 13795d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 13805d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1381d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1382d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1383d9fead3dSBarry Smith const PetscScalar *b; 138415091d37SBarry Smith PetscFunctionBegin; 1385d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13861ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1387f1af5d2fSBarry Smith t = a->solve_work; 138815091d37SBarry Smith 138915091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 139015091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 139115091d37SBarry Smith 139215091d37SBarry Smith /* forward solve the lower triangular */ 139315091d37SBarry Smith idx = 6*(*r++); 1394f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1395f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1396f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 139715091d37SBarry Smith for (i=1; i<n; i++) { 139815091d37SBarry Smith v = aa + 36*ai[i]; 139915091d37SBarry Smith vi = aj + ai[i]; 140015091d37SBarry Smith nz = diag[i] - ai[i]; 140115091d37SBarry Smith idx = 6*(*r++); 1402f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1403f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 140415091d37SBarry Smith while (nz--) { 140515091d37SBarry Smith idx = 6*(*vi++); 1406f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1407f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1408f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1409f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1410f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1411f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1412f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1413f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 141415091d37SBarry Smith v += 36; 141515091d37SBarry Smith } 141615091d37SBarry Smith idx = 6*i; 1417f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1418f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1419f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 142015091d37SBarry Smith } 142115091d37SBarry Smith /* backward solve the upper triangular */ 142215091d37SBarry Smith for (i=n-1; i>=0; i--){ 142315091d37SBarry Smith v = aa + 36*diag[i] + 36; 142415091d37SBarry Smith vi = aj + diag[i] + 1; 142515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 142615091d37SBarry Smith idt = 6*i; 1427f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1428f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1429f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 143015091d37SBarry Smith while (nz--) { 143115091d37SBarry Smith idx = 6*(*vi++); 1432f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1433f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1434f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1435f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1436f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1437f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1438f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1439f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1440f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 144115091d37SBarry Smith v += 36; 144215091d37SBarry Smith } 144315091d37SBarry Smith idc = 6*(*c--); 144415091d37SBarry Smith v = aa + 36*diag[i]; 1445f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1446f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1447f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1448f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1449f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1450f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1451f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1452f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1453f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1454f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1455f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1456f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 145715091d37SBarry Smith } 145815091d37SBarry Smith 145915091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 146015091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1461d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1463dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 146415091d37SBarry Smith PetscFunctionReturn(0); 146515091d37SBarry Smith } 146615091d37SBarry Smith 14674a2ae208SSatish Balay #undef __FUNCT__ 14684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1469dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 147015091d37SBarry Smith { 147115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1472690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1473dfbe8321SBarry Smith PetscErrorCode ierr; 1474690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1475d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1476d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1477d9fead3dSBarry Smith const PetscScalar *b; 147815091d37SBarry Smith 147915091d37SBarry Smith PetscFunctionBegin; 1480d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 148215091d37SBarry Smith /* forward solve the lower triangular */ 148315091d37SBarry Smith idx = 0; 148415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 148515091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 148615091d37SBarry Smith for (i=1; i<n; i++) { 148715091d37SBarry Smith v = aa + 36*ai[i]; 148815091d37SBarry Smith vi = aj + ai[i]; 148915091d37SBarry Smith nz = diag[i] - ai[i]; 149015091d37SBarry Smith idx = 6*i; 1491f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1492f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 149315091d37SBarry Smith while (nz--) { 149415091d37SBarry Smith jdx = 6*(*vi++); 149515091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 149615091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1497f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 150315091d37SBarry Smith v += 36; 150415091d37SBarry Smith } 1505f1af5d2fSBarry Smith x[idx] = s1; 1506f1af5d2fSBarry Smith x[1+idx] = s2; 1507f1af5d2fSBarry Smith x[2+idx] = s3; 1508f1af5d2fSBarry Smith x[3+idx] = s4; 1509f1af5d2fSBarry Smith x[4+idx] = s5; 1510f1af5d2fSBarry Smith x[5+idx] = s6; 151115091d37SBarry Smith } 151215091d37SBarry Smith /* backward solve the upper triangular */ 151315091d37SBarry Smith for (i=n-1; i>=0; i--){ 151415091d37SBarry Smith v = aa + 36*diag[i] + 36; 151515091d37SBarry Smith vi = aj + diag[i] + 1; 151615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 151715091d37SBarry Smith idt = 6*i; 1518f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1519f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1520f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 152115091d37SBarry Smith while (nz--) { 152215091d37SBarry Smith idx = 6*(*vi++); 152315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 152415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1525f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1526f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1527f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1528f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1529f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1530f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 153115091d37SBarry Smith v += 36; 153215091d37SBarry Smith } 153315091d37SBarry Smith v = aa + 36*diag[i]; 1534f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1535f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1536f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1537f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1538f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1539f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 154015091d37SBarry Smith } 154115091d37SBarry Smith 1542d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15431ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1544dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 154515091d37SBarry Smith PetscFunctionReturn(0); 154615091d37SBarry Smith } 154715091d37SBarry Smith 15484a2ae208SSatish Balay #undef __FUNCT__ 15494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1550dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 15514e2b4712SSatish Balay { 15524e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15534e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15546849ba73SBarry Smith PetscErrorCode ierr; 15555d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 15565d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1557d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1558d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1559d9fead3dSBarry Smith const PetscScalar *b; 15604e2b4712SSatish Balay 15614e2b4712SSatish Balay PetscFunctionBegin; 1562d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15631ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1564f1af5d2fSBarry Smith t = a->solve_work; 15654e2b4712SSatish Balay 15664e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15674e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15684e2b4712SSatish Balay 15694e2b4712SSatish Balay /* forward solve the lower triangular */ 15704e2b4712SSatish Balay idx = 5*(*r++); 1571f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1572f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 15734e2b4712SSatish Balay for (i=1; i<n; i++) { 15744e2b4712SSatish Balay v = aa + 25*ai[i]; 15754e2b4712SSatish Balay vi = aj + ai[i]; 15764e2b4712SSatish Balay nz = diag[i] - ai[i]; 15774e2b4712SSatish Balay idx = 5*(*r++); 1578f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1579f1af5d2fSBarry Smith s5 = b[4+idx]; 15804e2b4712SSatish Balay while (nz--) { 15814e2b4712SSatish Balay idx = 5*(*vi++); 1582f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1583f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1584f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1585f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1586f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1587f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1588f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 15894e2b4712SSatish Balay v += 25; 15904e2b4712SSatish Balay } 15914e2b4712SSatish Balay idx = 5*i; 1592f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1593f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 15944e2b4712SSatish Balay } 15954e2b4712SSatish Balay /* backward solve the upper triangular */ 15964e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15974e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 15984e2b4712SSatish Balay vi = aj + diag[i] + 1; 15994e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 16004e2b4712SSatish Balay idt = 5*i; 1601f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1602f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 16034e2b4712SSatish Balay while (nz--) { 16044e2b4712SSatish Balay idx = 5*(*vi++); 1605f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1606f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1607f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1608f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1609f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1610f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1611f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16124e2b4712SSatish Balay v += 25; 16134e2b4712SSatish Balay } 16144e2b4712SSatish Balay idc = 5*(*c--); 16154e2b4712SSatish Balay v = aa + 25*diag[i]; 1616f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1617f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1618f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1619f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1620f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1621f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1622f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1623f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1624f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1625f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 16264e2b4712SSatish Balay } 16274e2b4712SSatish Balay 16284e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16294e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1630d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16311ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1632dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 16334e2b4712SSatish Balay PetscFunctionReturn(0); 16344e2b4712SSatish Balay } 16354e2b4712SSatish Balay 163684a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 163784a281e5SHong Zhang { 163884a281e5SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 163984a281e5SHong Zhang PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 164084a281e5SHong Zhang PetscErrorCode ierr; 164184a281e5SHong Zhang PetscInt jdx; 164284a281e5SHong Zhang const MatScalar *aa=a->a,*v; 164384a281e5SHong Zhang PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 164484a281e5SHong Zhang const PetscScalar *b; 164584a281e5SHong Zhang 164684a281e5SHong Zhang PetscFunctionBegin; 164784a281e5SHong Zhang ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 164884a281e5SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 164984a281e5SHong Zhang /* forward solve the lower triangular */ 165084a281e5SHong Zhang idx = 0; 165184a281e5SHong Zhang x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 165284a281e5SHong Zhang for (i=1; i<n; i++) { 165384a281e5SHong Zhang v = aa + 25*ai[i]; 165484a281e5SHong Zhang vi = aj + ai[i]; 165584a281e5SHong Zhang nz = ai[i+1] - ai[i]; 165684a281e5SHong Zhang idx = 5*i; 165784a281e5SHong Zhang s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 165884a281e5SHong Zhang while (nz--) { 165984a281e5SHong Zhang jdx = 5*(*vi++); 166084a281e5SHong Zhang x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 166184a281e5SHong Zhang s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 166284a281e5SHong Zhang s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 166384a281e5SHong Zhang s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 166484a281e5SHong Zhang s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 166584a281e5SHong Zhang s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 166684a281e5SHong Zhang v += 25; 166784a281e5SHong Zhang } 166884a281e5SHong Zhang x[idx] = s1; 166984a281e5SHong Zhang x[1+idx] = s2; 167084a281e5SHong Zhang x[2+idx] = s3; 167184a281e5SHong Zhang x[3+idx] = s4; 167284a281e5SHong Zhang x[4+idx] = s5; 167384a281e5SHong Zhang } 167484a281e5SHong Zhang 167584a281e5SHong Zhang /* backward solve the upper triangular */ 167684a281e5SHong Zhang for (i=n-1; i>=0; i--){ 167784a281e5SHong Zhang v = aa + 25*ai[2*n-i]; 167884a281e5SHong Zhang vi = aj + ai[2*n-i]; 167984a281e5SHong Zhang nz = ai[2*n-i +1] - ai[2*n-i]-1; 168084a281e5SHong Zhang idt = 5*i; 168184a281e5SHong Zhang s1 = x[idt]; s2 = x[1+idt]; 168284a281e5SHong Zhang s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 168384a281e5SHong Zhang while (nz--) { 168484a281e5SHong Zhang idx = 5*(*vi++); 168584a281e5SHong Zhang x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 168684a281e5SHong Zhang s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 168784a281e5SHong Zhang s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 168884a281e5SHong Zhang s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 168984a281e5SHong Zhang s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 169084a281e5SHong Zhang s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 169184a281e5SHong Zhang v += 25; 169284a281e5SHong Zhang } 169384a281e5SHong Zhang /* x = inv_diagonal*x */ 169484a281e5SHong Zhang x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 169584a281e5SHong Zhang x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 169684a281e5SHong Zhang x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 169784a281e5SHong Zhang x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 169884a281e5SHong Zhang x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 169984a281e5SHong Zhang } 170084a281e5SHong Zhang 170184a281e5SHong Zhang ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 170284a281e5SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170384a281e5SHong Zhang ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 170484a281e5SHong Zhang PetscFunctionReturn(0); 170584a281e5SHong Zhang } 170684a281e5SHong Zhang 17074a2ae208SSatish Balay #undef __FUNCT__ 17084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1709dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 171015091d37SBarry Smith { 171115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1712690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1713dfbe8321SBarry Smith PetscErrorCode ierr; 1714690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1715d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1716d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1717d9fead3dSBarry Smith const PetscScalar *b; 171815091d37SBarry Smith 171915091d37SBarry Smith PetscFunctionBegin; 1720d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 172215091d37SBarry Smith /* forward solve the lower triangular */ 172315091d37SBarry Smith idx = 0; 172415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 172515091d37SBarry Smith for (i=1; i<n; i++) { 172615091d37SBarry Smith v = aa + 25*ai[i]; 172715091d37SBarry Smith vi = aj + ai[i]; 172815091d37SBarry Smith nz = diag[i] - ai[i]; 172915091d37SBarry Smith idx = 5*i; 1730f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 173115091d37SBarry Smith while (nz--) { 173215091d37SBarry Smith jdx = 5*(*vi++); 173315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1734f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1735f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1736f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1737f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1738f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 173915091d37SBarry Smith v += 25; 174015091d37SBarry Smith } 1741f1af5d2fSBarry Smith x[idx] = s1; 1742f1af5d2fSBarry Smith x[1+idx] = s2; 1743f1af5d2fSBarry Smith x[2+idx] = s3; 1744f1af5d2fSBarry Smith x[3+idx] = s4; 1745f1af5d2fSBarry Smith x[4+idx] = s5; 174615091d37SBarry Smith } 174715091d37SBarry Smith /* backward solve the upper triangular */ 174815091d37SBarry Smith for (i=n-1; i>=0; i--){ 174915091d37SBarry Smith v = aa + 25*diag[i] + 25; 175015091d37SBarry Smith vi = aj + diag[i] + 1; 175115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 175215091d37SBarry Smith idt = 5*i; 1753f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1754f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 175515091d37SBarry Smith while (nz--) { 175615091d37SBarry Smith idx = 5*(*vi++); 175715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1758f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1759f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1760f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1761f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1762f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 176315091d37SBarry Smith v += 25; 176415091d37SBarry Smith } 176515091d37SBarry Smith v = aa + 25*diag[i]; 1766f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1767f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1768f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1769f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1770f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 177115091d37SBarry Smith } 177215091d37SBarry Smith 1773d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1775dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 177615091d37SBarry Smith PetscFunctionReturn(0); 177715091d37SBarry Smith } 177815091d37SBarry Smith 17794a2ae208SSatish Balay #undef __FUNCT__ 17804a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1781dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 17824e2b4712SSatish Balay { 17834e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 17844e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 17856849ba73SBarry Smith PetscErrorCode ierr; 17865d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 17875d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1788d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1789d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1790d9fead3dSBarry Smith const PetscScalar *b; 17914e2b4712SSatish Balay 17924e2b4712SSatish Balay PetscFunctionBegin; 1793d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17941ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1795f1af5d2fSBarry Smith t = a->solve_work; 17964e2b4712SSatish Balay 17974e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17984e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17994e2b4712SSatish Balay 18004e2b4712SSatish Balay /* forward solve the lower triangular */ 18014e2b4712SSatish Balay idx = 4*(*r++); 1802f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1803f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 18044e2b4712SSatish Balay for (i=1; i<n; i++) { 18054e2b4712SSatish Balay v = aa + 16*ai[i]; 18064e2b4712SSatish Balay vi = aj + ai[i]; 18074e2b4712SSatish Balay nz = diag[i] - ai[i]; 18084e2b4712SSatish Balay idx = 4*(*r++); 1809f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 18104e2b4712SSatish Balay while (nz--) { 18114e2b4712SSatish Balay idx = 4*(*vi++); 1812f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1813f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1814f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1815f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1816f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 18174e2b4712SSatish Balay v += 16; 18184e2b4712SSatish Balay } 18194e2b4712SSatish Balay idx = 4*i; 1820f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1821f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 18224e2b4712SSatish Balay } 18234e2b4712SSatish Balay /* backward solve the upper triangular */ 18244e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 18254e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 18264e2b4712SSatish Balay vi = aj + diag[i] + 1; 18274e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 18284e2b4712SSatish Balay idt = 4*i; 1829f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1830f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 18314e2b4712SSatish Balay while (nz--) { 18324e2b4712SSatish Balay idx = 4*(*vi++); 1833f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1834f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1835f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1836f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1837f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1838f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 18394e2b4712SSatish Balay v += 16; 18404e2b4712SSatish Balay } 18414e2b4712SSatish Balay idc = 4*(*c--); 18424e2b4712SSatish Balay v = aa + 16*diag[i]; 1843f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1844f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1845f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1846f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 18474e2b4712SSatish Balay } 18484e2b4712SSatish Balay 18494e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 18504e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1851d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1853dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 18544e2b4712SSatish Balay PetscFunctionReturn(0); 18554e2b4712SSatish Balay } 1856f26ec98cSKris Buschelman 1857f26ec98cSKris Buschelman #undef __FUNCT__ 1858f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1859dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1860f26ec98cSKris Buschelman { 1861f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1862f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 18636849ba73SBarry Smith PetscErrorCode ierr; 18645d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 18655d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1866d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1867d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1868d9fead3dSBarry Smith PetscScalar *x; 1869d9fead3dSBarry Smith const PetscScalar *b; 1870f26ec98cSKris Buschelman 1871f26ec98cSKris Buschelman PetscFunctionBegin; 1872d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1874f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1875f26ec98cSKris Buschelman 1876f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1877f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1878f26ec98cSKris Buschelman 1879f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1880f26ec98cSKris Buschelman idx = 4*(*r++); 1881f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1882f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1883f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1884f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1885f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1886f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1887f26ec98cSKris Buschelman vi = aj + ai[i]; 1888f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1889f26ec98cSKris Buschelman idx = 4*(*r++); 1890f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1891f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1892f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1893f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1894f26ec98cSKris Buschelman while (nz--) { 1895f26ec98cSKris Buschelman idx = 4*(*vi++); 1896f26ec98cSKris Buschelman x1 = t[idx]; 1897f26ec98cSKris Buschelman x2 = t[1+idx]; 1898f26ec98cSKris Buschelman x3 = t[2+idx]; 1899f26ec98cSKris Buschelman x4 = t[3+idx]; 1900f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1901f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1902f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1903f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1904f26ec98cSKris Buschelman v += 16; 1905f26ec98cSKris Buschelman } 1906f26ec98cSKris Buschelman idx = 4*i; 1907f26ec98cSKris Buschelman t[idx] = s1; 1908f26ec98cSKris Buschelman t[1+idx] = s2; 1909f26ec98cSKris Buschelman t[2+idx] = s3; 1910f26ec98cSKris Buschelman t[3+idx] = s4; 1911f26ec98cSKris Buschelman } 1912f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1913f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1914f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1915f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1916f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1917f26ec98cSKris Buschelman idt = 4*i; 1918f26ec98cSKris Buschelman s1 = t[idt]; 1919f26ec98cSKris Buschelman s2 = t[1+idt]; 1920f26ec98cSKris Buschelman s3 = t[2+idt]; 1921f26ec98cSKris Buschelman s4 = t[3+idt]; 1922f26ec98cSKris Buschelman while (nz--) { 1923f26ec98cSKris Buschelman idx = 4*(*vi++); 1924f26ec98cSKris Buschelman x1 = t[idx]; 1925f26ec98cSKris Buschelman x2 = t[1+idx]; 1926f26ec98cSKris Buschelman x3 = t[2+idx]; 1927f26ec98cSKris Buschelman x4 = t[3+idx]; 1928f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1929f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1930f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1931f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1932f26ec98cSKris Buschelman v += 16; 1933f26ec98cSKris Buschelman } 1934f26ec98cSKris Buschelman idc = 4*(*c--); 1935f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1936f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1937f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1938f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1939f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1940f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 1941f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 1942f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 1943f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 1944f26ec98cSKris Buschelman } 1945f26ec98cSKris Buschelman 1946f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1948d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1951f26ec98cSKris Buschelman PetscFunctionReturn(0); 1952f26ec98cSKris Buschelman } 1953f26ec98cSKris Buschelman 195424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 195524c233c2SKris Buschelman 195624c233c2SKris Buschelman #include PETSC_HAVE_SSE 195724c233c2SKris Buschelman 195824c233c2SKris Buschelman #undef __FUNCT__ 195924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1960dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 196124c233c2SKris Buschelman { 196224c233c2SKris Buschelman /* 196324c233c2SKris Buschelman Note: This code uses demotion of double 196424c233c2SKris Buschelman to float when performing the mixed-mode computation. 196524c233c2SKris Buschelman This may not be numerically reasonable for all applications. 196624c233c2SKris Buschelman */ 196724c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 196824c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 19696849ba73SBarry Smith PetscErrorCode ierr; 19705d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 19715d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 197224c233c2SKris Buschelman MatScalar *aa=a->a,*v; 197387828ca2SBarry Smith PetscScalar *x,*b,*t; 197424c233c2SKris Buschelman 197524c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 197624c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 197724c233c2SKris Buschelman unsigned long offset; 197824c233c2SKris Buschelman 197924c233c2SKris Buschelman PetscFunctionBegin; 198024c233c2SKris Buschelman SSE_SCOPE_BEGIN; 198124c233c2SKris Buschelman 198224c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 198324c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 198424c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 198524c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 198624c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 198724c233c2SKris Buschelman 19881ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 19891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 199024c233c2SKris Buschelman t = a->solve_work; 199124c233c2SKris Buschelman 199224c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 199324c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 199424c233c2SKris Buschelman 199524c233c2SKris Buschelman /* forward solve the lower triangular */ 199624c233c2SKris Buschelman idx = 4*(*r++); 199724c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 199824c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 199924c233c2SKris Buschelman v = aa + 16*ai[1]; 200024c233c2SKris Buschelman 200124c233c2SKris Buschelman for (i=1; i<n;) { 200224c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 200324c233c2SKris Buschelman vi = aj + ai[i]; 200424c233c2SKris Buschelman nz = diag[i] - ai[i]; 200524c233c2SKris Buschelman idx = 4*(*r++); 200624c233c2SKris Buschelman 200724c233c2SKris Buschelman /* Demote sum from double to float */ 200824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 200924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 201024c233c2SKris Buschelman 201124c233c2SKris Buschelman while (nz--) { 201224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 201324c233c2SKris Buschelman idx = 4*(*vi++); 201424c233c2SKris Buschelman 201524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 201624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 201724c233c2SKris Buschelman 201824c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 201924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 202024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 202124c233c2SKris Buschelman 202224c233c2SKris Buschelman /* First Column */ 202324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 202424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 202524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 202624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 202724c233c2SKris Buschelman 202824c233c2SKris Buschelman /* Second Column */ 202924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 203024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 203124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 203224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 203324c233c2SKris Buschelman 203424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 203524c233c2SKris Buschelman 203624c233c2SKris Buschelman /* Third Column */ 203724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 203824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 203924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 204024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 204124c233c2SKris Buschelman 204224c233c2SKris Buschelman /* Fourth Column */ 204324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 204424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 204524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 204624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 204724c233c2SKris Buschelman SSE_INLINE_END_2 204824c233c2SKris Buschelman 204924c233c2SKris Buschelman v += 16; 205024c233c2SKris Buschelman } 205124c233c2SKris Buschelman idx = 4*i; 205224c233c2SKris Buschelman v = aa + 16*ai[++i]; 205324c233c2SKris Buschelman PREFETCH_NTA(v); 205424c233c2SKris Buschelman STORE_PS(tmps,XMM7); 205524c233c2SKris Buschelman 205624c233c2SKris Buschelman /* Promote result from float to double */ 205724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 205824c233c2SKris Buschelman } 205924c233c2SKris Buschelman /* backward solve the upper triangular */ 206024c233c2SKris Buschelman idt = 4*(n-1); 206124c233c2SKris Buschelman ai16 = 16*diag[n-1]; 206224c233c2SKris Buschelman v = aa + ai16 + 16; 206324c233c2SKris Buschelman for (i=n-1; i>=0;){ 206424c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 206524c233c2SKris Buschelman vi = aj + diag[i] + 1; 206624c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 206724c233c2SKris Buschelman 206824c233c2SKris Buschelman /* Demote accumulator from double to float */ 206924c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 207024c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 207124c233c2SKris Buschelman 207224c233c2SKris Buschelman while (nz--) { 207324c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 207424c233c2SKris Buschelman idx = 4*(*vi++); 207524c233c2SKris Buschelman 207624c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 207724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 207824c233c2SKris Buschelman 207924c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 208024c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 208124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 208224c233c2SKris Buschelman 208324c233c2SKris Buschelman /* First Column */ 208424c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 208524c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 208624c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 208724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 208824c233c2SKris Buschelman 208924c233c2SKris Buschelman /* Second Column */ 209024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 209124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 209224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 209324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 209424c233c2SKris Buschelman 209524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 209624c233c2SKris Buschelman 209724c233c2SKris Buschelman /* Third Column */ 209824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 209924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 210024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 210124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 210224c233c2SKris Buschelman 210324c233c2SKris Buschelman /* Fourth Column */ 210424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 210524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 210624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 210724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 210824c233c2SKris Buschelman SSE_INLINE_END_2 210924c233c2SKris Buschelman v += 16; 211024c233c2SKris Buschelman } 211124c233c2SKris Buschelman v = aa + ai16; 211224c233c2SKris Buschelman ai16 = 16*diag[--i]; 211324c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 211424c233c2SKris Buschelman /* 211524c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 211624c233c2SKris Buschelman which was inverted as part of the factorization 211724c233c2SKris Buschelman */ 211824c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 211924c233c2SKris Buschelman /* First Column */ 212024c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 212124c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 212224c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 212324c233c2SKris Buschelman 212424c233c2SKris Buschelman /* Second Column */ 212524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 212624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 212724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 212824c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 212924c233c2SKris Buschelman 213024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 213124c233c2SKris Buschelman 213224c233c2SKris Buschelman /* Third Column */ 213324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 213424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 213524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 213624c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 213724c233c2SKris Buschelman 213824c233c2SKris Buschelman /* Fourth Column */ 213924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 214024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 214124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 214224c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 214324c233c2SKris Buschelman 214424c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 214524c233c2SKris Buschelman SSE_INLINE_END_3 214624c233c2SKris Buschelman 214724c233c2SKris Buschelman /* Promote solution from float to double */ 214824c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 214924c233c2SKris Buschelman 215024c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 215124c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 215224c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 215324c233c2SKris Buschelman idc = 4*(*c--); 215424c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 215524c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 215624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 215724c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 215824c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 215924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 216024c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 216124c233c2SKris Buschelman SSE_INLINE_END_2 216224c233c2SKris Buschelman v = aa + ai16 + 16; 216324c233c2SKris Buschelman idt -= 4; 216424c233c2SKris Buschelman } 216524c233c2SKris Buschelman 216624c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 216724c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21681ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 21691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2170dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 217124c233c2SKris Buschelman SSE_SCOPE_END; 217224c233c2SKris Buschelman PetscFunctionReturn(0); 217324c233c2SKris Buschelman } 217424c233c2SKris Buschelman 217524c233c2SKris Buschelman #endif 21760ef38995SBarry Smith 21770ef38995SBarry Smith 21784e2b4712SSatish Balay /* 21794e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 21804e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 21814e2b4712SSatish Balay */ 21824a2ae208SSatish Balay #undef __FUNCT__ 21834a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2184dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 21854e2b4712SSatish Balay { 21864e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2187356650c2SBarry Smith PetscInt n=a->mbs; 2188356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2189dfbe8321SBarry Smith PetscErrorCode ierr; 2190356650c2SBarry Smith const PetscInt *diag = a->diag; 2191d9fead3dSBarry Smith const MatScalar *aa=a->a; 2192d9fead3dSBarry Smith PetscScalar *x; 2193d9fead3dSBarry Smith const PetscScalar *b; 21944e2b4712SSatish Balay 21954e2b4712SSatish Balay PetscFunctionBegin; 2196d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21971ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21984e2b4712SSatish Balay 2199aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 22002853dc0eSBarry Smith { 220187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 22022853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 22032853dc0eSBarry Smith } 2204aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 22052853dc0eSBarry Smith { 220687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 22072853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 22082853dc0eSBarry Smith } 2209aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 22102853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2211e1293385SBarry Smith #else 221230d4dcafSBarry Smith { 221387828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2214d9fead3dSBarry Smith const MatScalar *v; 2215356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2216356650c2SBarry Smith const PetscInt *vi; 2217e1293385SBarry Smith 22184e2b4712SSatish Balay /* forward solve the lower triangular */ 22194e2b4712SSatish Balay idx = 0; 2220e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 22214e2b4712SSatish Balay for (i=1; i<n; i++) { 22224e2b4712SSatish Balay v = aa + 16*ai[i]; 22234e2b4712SSatish Balay vi = aj + ai[i]; 22244e2b4712SSatish Balay nz = diag[i] - ai[i]; 2225e1293385SBarry Smith idx += 4; 2226f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 22274e2b4712SSatish Balay while (nz--) { 22284e2b4712SSatish Balay jdx = 4*(*vi++); 22294e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2230f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 22344e2b4712SSatish Balay v += 16; 22354e2b4712SSatish Balay } 2236f1af5d2fSBarry Smith x[idx] = s1; 2237f1af5d2fSBarry Smith x[1+idx] = s2; 2238f1af5d2fSBarry Smith x[2+idx] = s3; 2239f1af5d2fSBarry Smith x[3+idx] = s4; 22404e2b4712SSatish Balay } 22414e2b4712SSatish Balay /* backward solve the upper triangular */ 22424e555682SBarry Smith idt = 4*(n-1); 22434e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22444e555682SBarry Smith ai16 = 16*diag[i]; 22454e555682SBarry Smith v = aa + ai16 + 16; 22464e2b4712SSatish Balay vi = aj + diag[i] + 1; 22474e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2248f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2249f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 22504e2b4712SSatish Balay while (nz--) { 22514e2b4712SSatish Balay idx = 4*(*vi++); 22524e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2253f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2254f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2255f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2256f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 22574e2b4712SSatish Balay v += 16; 22584e2b4712SSatish Balay } 22594e555682SBarry Smith v = aa + ai16; 2260f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2261f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2262f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2263f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2264329f5518SBarry Smith idt -= 4; 22654e2b4712SSatish Balay } 226630d4dcafSBarry Smith } 2267e1293385SBarry Smith #endif 22684e2b4712SSatish Balay 2269d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22701ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2271dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 22724e2b4712SSatish Balay PetscFunctionReturn(0); 22734e2b4712SSatish Balay } 22744e2b4712SSatish Balay 2275f26ec98cSKris Buschelman #undef __FUNCT__ 2276f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2278f26ec98cSKris Buschelman { 2279f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2280690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2281dfbe8321SBarry Smith PetscErrorCode ierr; 2282690b6cddSBarry Smith PetscInt *diag = a->diag; 2283f26ec98cSKris Buschelman MatScalar *aa=a->a; 2284f26ec98cSKris Buschelman PetscScalar *x,*b; 2285f26ec98cSKris Buschelman 2286f26ec98cSKris Buschelman PetscFunctionBegin; 22871ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2289f26ec98cSKris Buschelman 2290f26ec98cSKris Buschelman { 2291f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2292f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2293690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2294f26ec98cSKris Buschelman 2295f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2296f26ec98cSKris Buschelman idx = 0; 2297f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2298f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2299f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2300f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2301f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2302f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2303f26ec98cSKris Buschelman vi = aj + ai[i]; 2304f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2305f26ec98cSKris Buschelman idx += 4; 2306f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2307f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2308f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2309f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2310f26ec98cSKris Buschelman while (nz--) { 2311f26ec98cSKris Buschelman jdx = 4*(*vi++); 2312f26ec98cSKris Buschelman x1 = t[jdx]; 2313f26ec98cSKris Buschelman x2 = t[1+jdx]; 2314f26ec98cSKris Buschelman x3 = t[2+jdx]; 2315f26ec98cSKris Buschelman x4 = t[3+jdx]; 2316f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2317f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2318f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2319f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2320f26ec98cSKris Buschelman v += 16; 2321f26ec98cSKris Buschelman } 2322f26ec98cSKris Buschelman t[idx] = s1; 2323f26ec98cSKris Buschelman t[1+idx] = s2; 2324f26ec98cSKris Buschelman t[2+idx] = s3; 2325f26ec98cSKris Buschelman t[3+idx] = s4; 2326f26ec98cSKris Buschelman } 2327f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2328f26ec98cSKris Buschelman idt = 4*(n-1); 2329f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2330f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2331f26ec98cSKris Buschelman v = aa + ai16 + 16; 2332f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2333f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2334f26ec98cSKris Buschelman s1 = t[idt]; 2335f26ec98cSKris Buschelman s2 = t[1+idt]; 2336f26ec98cSKris Buschelman s3 = t[2+idt]; 2337f26ec98cSKris Buschelman s4 = t[3+idt]; 2338f26ec98cSKris Buschelman while (nz--) { 2339f26ec98cSKris Buschelman idx = 4*(*vi++); 2340f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2341f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2342f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2343f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2344f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2345f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2346f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2347f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2348f26ec98cSKris Buschelman v += 16; 2349f26ec98cSKris Buschelman } 2350f26ec98cSKris Buschelman v = aa + ai16; 2351f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2352f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2353f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2354f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2355f26ec98cSKris Buschelman idt -= 4; 2356f26ec98cSKris Buschelman } 2357f26ec98cSKris Buschelman } 2358f26ec98cSKris Buschelman 23591ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 23601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2361dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2362f26ec98cSKris Buschelman PetscFunctionReturn(0); 2363f26ec98cSKris Buschelman } 2364f26ec98cSKris Buschelman 23653660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 23663660e330SKris Buschelman 23673660e330SKris Buschelman #include PETSC_HAVE_SSE 23683660e330SKris Buschelman #undef __FUNCT__ 23697cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2370dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 23713660e330SKris Buschelman { 23723660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 23732aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2374dfbe8321SBarry Smith PetscErrorCode ierr; 2375dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 23763660e330SKris Buschelman MatScalar *aa=a->a; 237787828ca2SBarry Smith PetscScalar *x,*b; 23783660e330SKris Buschelman 23793660e330SKris Buschelman PetscFunctionBegin; 23803660e330SKris Buschelman SSE_SCOPE_BEGIN; 23813660e330SKris Buschelman /* 23823660e330SKris Buschelman Note: This code currently uses demotion of double 23833660e330SKris Buschelman to float when performing the mixed-mode computation. 23843660e330SKris Buschelman This may not be numerically reasonable for all applications. 23853660e330SKris Buschelman */ 23863660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 23873660e330SKris Buschelman 23881ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23891ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23903660e330SKris Buschelman { 2391eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2392eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 23932aa5897fSKris Buschelman int nz,i,idt,ai16; 23942aa5897fSKris Buschelman unsigned int jdx,idx; 23952aa5897fSKris Buschelman unsigned short *vi; 2396eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 23973660e330SKris Buschelman 2398eb05f457SKris Buschelman /* First block is the identity. */ 23993660e330SKris Buschelman idx = 0; 2400eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 24012aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 24023660e330SKris Buschelman 24033660e330SKris Buschelman for (i=1; i<n;) { 24043660e330SKris Buschelman PREFETCH_NTA(&v[8]); 24053660e330SKris Buschelman vi = aj + ai[i]; 24063660e330SKris Buschelman nz = diag[i] - ai[i]; 24073660e330SKris Buschelman idx += 4; 24083660e330SKris Buschelman 2409eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2410eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2411eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 24123660e330SKris Buschelman 24133660e330SKris Buschelman while (nz--) { 24143660e330SKris Buschelman PREFETCH_NTA(&v[16]); 24152aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 24163660e330SKris Buschelman 24173660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2418eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 24193660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 24203660e330SKris Buschelman 24213660e330SKris Buschelman /* First Column */ 24223660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 24233660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24243660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 24253660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 24263660e330SKris Buschelman 24273660e330SKris Buschelman /* Second Column */ 24283660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 24293660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24303660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 24313660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 24323660e330SKris Buschelman 24333660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 24343660e330SKris Buschelman 24353660e330SKris Buschelman /* Third Column */ 24363660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24373660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24383660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24393660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24403660e330SKris Buschelman 24413660e330SKris Buschelman /* Fourth Column */ 24423660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 24433660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24443660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 24453660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 24463660e330SKris Buschelman SSE_INLINE_END_2 24473660e330SKris Buschelman 24483660e330SKris Buschelman v += 16; 24493660e330SKris Buschelman } 24503660e330SKris Buschelman v = aa + 16*ai[++i]; 24513660e330SKris Buschelman PREFETCH_NTA(v); 2452eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 24533660e330SKris Buschelman } 2454eb05f457SKris Buschelman 2455eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2456eb05f457SKris Buschelman 24573660e330SKris Buschelman idt = 4*(n-1); 24583660e330SKris Buschelman ai16 = 16*diag[n-1]; 24593660e330SKris Buschelman v = aa + ai16 + 16; 24603660e330SKris Buschelman for (i=n-1; i>=0;){ 24613660e330SKris Buschelman PREFETCH_NTA(&v[8]); 24623660e330SKris Buschelman vi = aj + diag[i] + 1; 24633660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 24643660e330SKris Buschelman 2465eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 24663660e330SKris Buschelman 24673660e330SKris Buschelman while (nz--) { 24683660e330SKris Buschelman PREFETCH_NTA(&v[16]); 24692aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 24703660e330SKris Buschelman 24713660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2472eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 24733660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 24743660e330SKris Buschelman 24753660e330SKris Buschelman /* First Column */ 24763660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 24773660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24783660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 24793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 24803660e330SKris Buschelman 24813660e330SKris Buschelman /* Second Column */ 24823660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 24833660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24843660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 24853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 24863660e330SKris Buschelman 24873660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 24883660e330SKris Buschelman 24893660e330SKris Buschelman /* Third Column */ 24903660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24913660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24923660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24933660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24943660e330SKris Buschelman 24953660e330SKris Buschelman /* Fourth Column */ 24963660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 24973660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 24983660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 24993660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25003660e330SKris Buschelman SSE_INLINE_END_2 25013660e330SKris Buschelman v += 16; 25023660e330SKris Buschelman } 25033660e330SKris Buschelman v = aa + ai16; 25043660e330SKris Buschelman ai16 = 16*diag[--i]; 25053660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 25063660e330SKris Buschelman /* 25073660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 25083660e330SKris Buschelman which was inverted as part of the factorization 25093660e330SKris Buschelman */ 2510eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 25113660e330SKris Buschelman /* First Column */ 25123660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 25133660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25143660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 25153660e330SKris Buschelman 25163660e330SKris Buschelman /* Second Column */ 25173660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 25183660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25193660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 25203660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 25213660e330SKris Buschelman 25223660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 25233660e330SKris Buschelman 25243660e330SKris Buschelman /* Third Column */ 25253660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 25263660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25273660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 25283660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 25293660e330SKris Buschelman 25303660e330SKris Buschelman /* Fourth Column */ 25313660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 25323660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25333660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 25343660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 25353660e330SKris Buschelman 25363660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 25373660e330SKris Buschelman SSE_INLINE_END_3 25383660e330SKris Buschelman 25393660e330SKris Buschelman v = aa + ai16 + 16; 25403660e330SKris Buschelman idt -= 4; 25413660e330SKris Buschelman } 2542eb05f457SKris Buschelman 2543eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2544eb05f457SKris Buschelman idt = 4*(n-1); 2545eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2546eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2547eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2548eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2549eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2550eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2551eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2552eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2553eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 255454693613SKris Buschelman idt -= 4; 25553660e330SKris Buschelman } 2556eb05f457SKris Buschelman 2557eb05f457SKris Buschelman } /* End of artificial scope. */ 25581ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 25591ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2560dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 25613660e330SKris Buschelman SSE_SCOPE_END; 25623660e330SKris Buschelman PetscFunctionReturn(0); 25633660e330SKris Buschelman } 25643660e330SKris Buschelman 25657cf1b8d3SKris Buschelman #undef __FUNCT__ 25667cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2567dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 25687cf1b8d3SKris Buschelman { 25697cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 25707cf1b8d3SKris Buschelman int *aj=a->j; 2571dfbe8321SBarry Smith PetscErrorCode ierr; 2572dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 25737cf1b8d3SKris Buschelman MatScalar *aa=a->a; 25747cf1b8d3SKris Buschelman PetscScalar *x,*b; 25757cf1b8d3SKris Buschelman 25767cf1b8d3SKris Buschelman PetscFunctionBegin; 25777cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 25787cf1b8d3SKris Buschelman /* 25797cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 25807cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 25817cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 25827cf1b8d3SKris Buschelman */ 25837cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 25847cf1b8d3SKris Buschelman 25851ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25861ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25877cf1b8d3SKris Buschelman { 25887cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 25897cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 25907cf1b8d3SKris Buschelman int nz,i,idt,ai16; 25917cf1b8d3SKris Buschelman int jdx,idx; 25927cf1b8d3SKris Buschelman int *vi; 25937cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 25947cf1b8d3SKris Buschelman 25957cf1b8d3SKris Buschelman /* First block is the identity. */ 25967cf1b8d3SKris Buschelman idx = 0; 25977cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 25987cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 25997cf1b8d3SKris Buschelman 26007cf1b8d3SKris Buschelman for (i=1; i<n;) { 26017cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 26027cf1b8d3SKris Buschelman vi = aj + ai[i]; 26037cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 26047cf1b8d3SKris Buschelman idx += 4; 26057cf1b8d3SKris Buschelman 26067cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 26077cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 26087cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 26097cf1b8d3SKris Buschelman 26107cf1b8d3SKris Buschelman while (nz--) { 26117cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 26127cf1b8d3SKris Buschelman jdx = 4*(*vi++); 26137cf1b8d3SKris Buschelman /* jdx = *vi++; */ 26147cf1b8d3SKris Buschelman 26157cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 26167cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 26177cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 26187cf1b8d3SKris Buschelman 26197cf1b8d3SKris Buschelman /* First Column */ 26207cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 26217cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26227cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26237cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26247cf1b8d3SKris Buschelman 26257cf1b8d3SKris Buschelman /* Second Column */ 26267cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26277cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26287cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26297cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26307cf1b8d3SKris Buschelman 26317cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26327cf1b8d3SKris Buschelman 26337cf1b8d3SKris Buschelman /* Third Column */ 26347cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26357cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26367cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26377cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26387cf1b8d3SKris Buschelman 26397cf1b8d3SKris Buschelman /* Fourth Column */ 26407cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26417cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26427cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26437cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26447cf1b8d3SKris Buschelman SSE_INLINE_END_2 26457cf1b8d3SKris Buschelman 26467cf1b8d3SKris Buschelman v += 16; 26477cf1b8d3SKris Buschelman } 26487cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 26497cf1b8d3SKris Buschelman PREFETCH_NTA(v); 26507cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 26517cf1b8d3SKris Buschelman } 26527cf1b8d3SKris Buschelman 26537cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 26547cf1b8d3SKris Buschelman 26557cf1b8d3SKris Buschelman idt = 4*(n-1); 26567cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 26577cf1b8d3SKris Buschelman v = aa + ai16 + 16; 26587cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 26597cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 26607cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 26617cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 26627cf1b8d3SKris Buschelman 26637cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 26647cf1b8d3SKris Buschelman 26657cf1b8d3SKris Buschelman while (nz--) { 26667cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 26677cf1b8d3SKris Buschelman idx = 4*(*vi++); 26687cf1b8d3SKris Buschelman /* idx = *vi++; */ 26697cf1b8d3SKris Buschelman 26707cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 26717cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 26727cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 26737cf1b8d3SKris Buschelman 26747cf1b8d3SKris Buschelman /* First Column */ 26757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 26767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26787cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26797cf1b8d3SKris Buschelman 26807cf1b8d3SKris Buschelman /* Second Column */ 26817cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26827cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26837cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26847cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26857cf1b8d3SKris Buschelman 26867cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26877cf1b8d3SKris Buschelman 26887cf1b8d3SKris Buschelman /* Third Column */ 26897cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26907cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26917cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26927cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26937cf1b8d3SKris Buschelman 26947cf1b8d3SKris Buschelman /* Fourth Column */ 26957cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 26967cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 26977cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 26987cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 26997cf1b8d3SKris Buschelman SSE_INLINE_END_2 27007cf1b8d3SKris Buschelman v += 16; 27017cf1b8d3SKris Buschelman } 27027cf1b8d3SKris Buschelman v = aa + ai16; 27037cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 27047cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 27057cf1b8d3SKris Buschelman /* 27067cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 27077cf1b8d3SKris Buschelman which was inverted as part of the factorization 27087cf1b8d3SKris Buschelman */ 27097cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 27107cf1b8d3SKris Buschelman /* First Column */ 27117cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 27127cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 27137cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 27147cf1b8d3SKris Buschelman 27157cf1b8d3SKris Buschelman /* Second Column */ 27167cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 27177cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 27187cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 27197cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 27207cf1b8d3SKris Buschelman 27217cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 27227cf1b8d3SKris Buschelman 27237cf1b8d3SKris Buschelman /* Third Column */ 27247cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 27257cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 27267cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 27277cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 27287cf1b8d3SKris Buschelman 27297cf1b8d3SKris Buschelman /* Fourth Column */ 27307cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 27317cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27327cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 27337cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 27347cf1b8d3SKris Buschelman 27357cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 27367cf1b8d3SKris Buschelman SSE_INLINE_END_3 27377cf1b8d3SKris Buschelman 27387cf1b8d3SKris Buschelman v = aa + ai16 + 16; 27397cf1b8d3SKris Buschelman idt -= 4; 27407cf1b8d3SKris Buschelman } 27417cf1b8d3SKris Buschelman 27427cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 27437cf1b8d3SKris Buschelman idt = 4*(n-1); 27447cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 27457cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 27467cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 27477cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 27487cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 27497cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 27507cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 27517cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 27527cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 27537cf1b8d3SKris Buschelman idt -= 4; 27547cf1b8d3SKris Buschelman } 27557cf1b8d3SKris Buschelman 27567cf1b8d3SKris Buschelman } /* End of artificial scope. */ 27571ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 27581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2759dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 27607cf1b8d3SKris Buschelman SSE_SCOPE_END; 27617cf1b8d3SKris Buschelman PetscFunctionReturn(0); 27627cf1b8d3SKris Buschelman } 27637cf1b8d3SKris Buschelman 27643660e330SKris Buschelman #endif 27654a2ae208SSatish Balay #undef __FUNCT__ 27664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2767dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 27684e2b4712SSatish Balay { 27694e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 27704e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27716849ba73SBarry Smith PetscErrorCode ierr; 27725d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 27735d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2774d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2775d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2776d9fead3dSBarry Smith const PetscScalar *b; 27774e2b4712SSatish Balay 27784e2b4712SSatish Balay PetscFunctionBegin; 2779d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2781f1af5d2fSBarry Smith t = a->solve_work; 27824e2b4712SSatish Balay 27834e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 27844e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 27854e2b4712SSatish Balay 27864e2b4712SSatish Balay /* forward solve the lower triangular */ 27874e2b4712SSatish Balay idx = 3*(*r++); 2788f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 27894e2b4712SSatish Balay for (i=1; i<n; i++) { 27904e2b4712SSatish Balay v = aa + 9*ai[i]; 27914e2b4712SSatish Balay vi = aj + ai[i]; 27924e2b4712SSatish Balay nz = diag[i] - ai[i]; 27934e2b4712SSatish Balay idx = 3*(*r++); 2794f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 27954e2b4712SSatish Balay while (nz--) { 27964e2b4712SSatish Balay idx = 3*(*vi++); 2797f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2798f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2799f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2800f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 28014e2b4712SSatish Balay v += 9; 28024e2b4712SSatish Balay } 28034e2b4712SSatish Balay idx = 3*i; 2804f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 28054e2b4712SSatish Balay } 28064e2b4712SSatish Balay /* backward solve the upper triangular */ 28074e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28084e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 28094e2b4712SSatish Balay vi = aj + diag[i] + 1; 28104e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28114e2b4712SSatish Balay idt = 3*i; 2812f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 28134e2b4712SSatish Balay while (nz--) { 28144e2b4712SSatish Balay idx = 3*(*vi++); 2815f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2816f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2817f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2818f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 28194e2b4712SSatish Balay v += 9; 28204e2b4712SSatish Balay } 28214e2b4712SSatish Balay idc = 3*(*c--); 28224e2b4712SSatish Balay v = aa + 9*diag[i]; 2823f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2824f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2825f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 28264e2b4712SSatish Balay } 28274e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28284e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2829d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2831dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 28324e2b4712SSatish Balay PetscFunctionReturn(0); 28334e2b4712SSatish Balay } 28344e2b4712SSatish Balay 283515091d37SBarry Smith /* 283615091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 283715091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 283815091d37SBarry Smith */ 28394a2ae208SSatish Balay #undef __FUNCT__ 28404a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2841dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 284215091d37SBarry Smith { 284315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2844690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2845dfbe8321SBarry Smith PetscErrorCode ierr; 2846690b6cddSBarry Smith PetscInt *diag = a->diag; 2847d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2848d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 2849d9fead3dSBarry Smith const PetscScalar *b; 2850690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 285115091d37SBarry Smith 285215091d37SBarry Smith PetscFunctionBegin; 2853d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28541ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 285515091d37SBarry Smith 285615091d37SBarry Smith /* forward solve the lower triangular */ 285715091d37SBarry Smith idx = 0; 285815091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 285915091d37SBarry Smith for (i=1; i<n; i++) { 286015091d37SBarry Smith v = aa + 9*ai[i]; 286115091d37SBarry Smith vi = aj + ai[i]; 286215091d37SBarry Smith nz = diag[i] - ai[i]; 286315091d37SBarry Smith idx += 3; 2864f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 286515091d37SBarry Smith while (nz--) { 286615091d37SBarry Smith jdx = 3*(*vi++); 286715091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2868f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2869f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2870f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 287115091d37SBarry Smith v += 9; 287215091d37SBarry Smith } 2873f1af5d2fSBarry Smith x[idx] = s1; 2874f1af5d2fSBarry Smith x[1+idx] = s2; 2875f1af5d2fSBarry Smith x[2+idx] = s3; 287615091d37SBarry Smith } 287715091d37SBarry Smith /* backward solve the upper triangular */ 287815091d37SBarry Smith for (i=n-1; i>=0; i--){ 287915091d37SBarry Smith v = aa + 9*diag[i] + 9; 288015091d37SBarry Smith vi = aj + diag[i] + 1; 288115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 288215091d37SBarry Smith idt = 3*i; 2883f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2884f1af5d2fSBarry Smith s3 = x[2+idt]; 288515091d37SBarry Smith while (nz--) { 288615091d37SBarry Smith idx = 3*(*vi++); 288715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2888f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2889f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2890f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 289115091d37SBarry Smith v += 9; 289215091d37SBarry Smith } 289315091d37SBarry Smith v = aa + 9*diag[i]; 2894f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2895f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2896f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 289715091d37SBarry Smith } 289815091d37SBarry Smith 2899d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29001ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2901dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 290215091d37SBarry Smith PetscFunctionReturn(0); 290315091d37SBarry Smith } 290415091d37SBarry Smith 29054a2ae208SSatish Balay #undef __FUNCT__ 29064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2907dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 29084e2b4712SSatish Balay { 29094e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29104e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29116849ba73SBarry Smith PetscErrorCode ierr; 29125d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 29135d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2914d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2915d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 2916d9fead3dSBarry Smith const PetscScalar *b; 29174e2b4712SSatish Balay 29184e2b4712SSatish Balay PetscFunctionBegin; 2919d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29201ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2921f1af5d2fSBarry Smith t = a->solve_work; 29224e2b4712SSatish Balay 29234e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29244e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29254e2b4712SSatish Balay 29264e2b4712SSatish Balay /* forward solve the lower triangular */ 29274e2b4712SSatish Balay idx = 2*(*r++); 2928f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 29294e2b4712SSatish Balay for (i=1; i<n; i++) { 29304e2b4712SSatish Balay v = aa + 4*ai[i]; 29314e2b4712SSatish Balay vi = aj + ai[i]; 29324e2b4712SSatish Balay nz = diag[i] - ai[i]; 29334e2b4712SSatish Balay idx = 2*(*r++); 2934f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 29354e2b4712SSatish Balay while (nz--) { 29364e2b4712SSatish Balay idx = 2*(*vi++); 2937f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2938f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2939f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 29404e2b4712SSatish Balay v += 4; 29414e2b4712SSatish Balay } 29424e2b4712SSatish Balay idx = 2*i; 2943f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 29444e2b4712SSatish Balay } 29454e2b4712SSatish Balay /* backward solve the upper triangular */ 29464e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 29474e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 29484e2b4712SSatish Balay vi = aj + diag[i] + 1; 29494e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 29504e2b4712SSatish Balay idt = 2*i; 2951f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 29524e2b4712SSatish Balay while (nz--) { 29534e2b4712SSatish Balay idx = 2*(*vi++); 2954f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2955f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2956f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 29574e2b4712SSatish Balay v += 4; 29584e2b4712SSatish Balay } 29594e2b4712SSatish Balay idc = 2*(*c--); 29604e2b4712SSatish Balay v = aa + 4*diag[i]; 2961f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2962f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 29634e2b4712SSatish Balay } 29644e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 29654e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2966d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29671ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2968dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 29694e2b4712SSatish Balay PetscFunctionReturn(0); 29704e2b4712SSatish Balay } 29714e2b4712SSatish Balay 297215091d37SBarry Smith /* 297315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 297415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 297515091d37SBarry Smith */ 29764a2ae208SSatish Balay #undef __FUNCT__ 29774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2978dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 297915091d37SBarry Smith { 298015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2981690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2982dfbe8321SBarry Smith PetscErrorCode ierr; 2983690b6cddSBarry Smith PetscInt *diag = a->diag; 2984d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2985d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 2986d9fead3dSBarry Smith const PetscScalar *b; 2987690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 298815091d37SBarry Smith 298915091d37SBarry Smith PetscFunctionBegin; 2990d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 299215091d37SBarry Smith 299315091d37SBarry Smith /* forward solve the lower triangular */ 299415091d37SBarry Smith idx = 0; 299515091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 299615091d37SBarry Smith for (i=1; i<n; i++) { 299715091d37SBarry Smith v = aa + 4*ai[i]; 299815091d37SBarry Smith vi = aj + ai[i]; 299915091d37SBarry Smith nz = diag[i] - ai[i]; 300015091d37SBarry Smith idx += 2; 3001f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 300215091d37SBarry Smith while (nz--) { 300315091d37SBarry Smith jdx = 2*(*vi++); 300415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 3005f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3006f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 300715091d37SBarry Smith v += 4; 300815091d37SBarry Smith } 3009f1af5d2fSBarry Smith x[idx] = s1; 3010f1af5d2fSBarry Smith x[1+idx] = s2; 301115091d37SBarry Smith } 301215091d37SBarry Smith /* backward solve the upper triangular */ 301315091d37SBarry Smith for (i=n-1; i>=0; i--){ 301415091d37SBarry Smith v = aa + 4*diag[i] + 4; 301515091d37SBarry Smith vi = aj + diag[i] + 1; 301615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 301715091d37SBarry Smith idt = 2*i; 3018f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 301915091d37SBarry Smith while (nz--) { 302015091d37SBarry Smith idx = 2*(*vi++); 302115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 3022f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3023f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 302415091d37SBarry Smith v += 4; 302515091d37SBarry Smith } 302615091d37SBarry Smith v = aa + 4*diag[i]; 3027f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 3028f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 302915091d37SBarry Smith } 303015091d37SBarry Smith 3031d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30321ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3033dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 303415091d37SBarry Smith PetscFunctionReturn(0); 303515091d37SBarry Smith } 303615091d37SBarry Smith 30374a2ae208SSatish Balay #undef __FUNCT__ 30384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3039dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 30404e2b4712SSatish Balay { 30414e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 30424e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 30436849ba73SBarry Smith PetscErrorCode ierr; 30445d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 30455d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 30463f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 304787828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 30484e2b4712SSatish Balay 30494e2b4712SSatish Balay PetscFunctionBegin; 30504e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 30514e2b4712SSatish Balay 30521ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3054f1af5d2fSBarry Smith t = a->solve_work; 30554e2b4712SSatish Balay 30564e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 30574e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 30584e2b4712SSatish Balay 30594e2b4712SSatish Balay /* forward solve the lower triangular */ 3060f1af5d2fSBarry Smith t[0] = b[*r++]; 30614e2b4712SSatish Balay for (i=1; i<n; i++) { 30624e2b4712SSatish Balay v = aa + ai[i]; 30634e2b4712SSatish Balay vi = aj + ai[i]; 30644e2b4712SSatish Balay nz = diag[i] - ai[i]; 3065f1af5d2fSBarry Smith s1 = b[*r++]; 30664e2b4712SSatish Balay while (nz--) { 3067f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 30684e2b4712SSatish Balay } 3069f1af5d2fSBarry Smith t[i] = s1; 30704e2b4712SSatish Balay } 30714e2b4712SSatish Balay /* backward solve the upper triangular */ 30724e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30734e2b4712SSatish Balay v = aa + diag[i] + 1; 30744e2b4712SSatish Balay vi = aj + diag[i] + 1; 30754e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3076f1af5d2fSBarry Smith s1 = t[i]; 30774e2b4712SSatish Balay while (nz--) { 3078f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 30794e2b4712SSatish Balay } 3080f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 30814e2b4712SSatish Balay } 30824e2b4712SSatish Balay 30834e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30844e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30851ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 30861ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3087dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 30884e2b4712SSatish Balay PetscFunctionReturn(0); 30894e2b4712SSatish Balay } 309015091d37SBarry Smith /* 309115091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 309215091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 309315091d37SBarry Smith */ 30944a2ae208SSatish Balay #undef __FUNCT__ 30954a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3096dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 309715091d37SBarry Smith { 309815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3099690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3100dfbe8321SBarry Smith PetscErrorCode ierr; 3101690b6cddSBarry Smith PetscInt *diag = a->diag; 310215091d37SBarry Smith MatScalar *aa=a->a; 310387828ca2SBarry Smith PetscScalar *x,*b; 310487828ca2SBarry Smith PetscScalar s1,x1; 310515091d37SBarry Smith MatScalar *v; 3106690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 310715091d37SBarry Smith 310815091d37SBarry Smith PetscFunctionBegin; 31091ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 31101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 311115091d37SBarry Smith 311215091d37SBarry Smith /* forward solve the lower triangular */ 311315091d37SBarry Smith idx = 0; 311415091d37SBarry Smith x[0] = b[0]; 311515091d37SBarry Smith for (i=1; i<n; i++) { 311615091d37SBarry Smith v = aa + ai[i]; 311715091d37SBarry Smith vi = aj + ai[i]; 311815091d37SBarry Smith nz = diag[i] - ai[i]; 311915091d37SBarry Smith idx += 1; 3120f1af5d2fSBarry Smith s1 = b[idx]; 312115091d37SBarry Smith while (nz--) { 312215091d37SBarry Smith jdx = *vi++; 312315091d37SBarry Smith x1 = x[jdx]; 3124f1af5d2fSBarry Smith s1 -= v[0]*x1; 312515091d37SBarry Smith v += 1; 312615091d37SBarry Smith } 3127f1af5d2fSBarry Smith x[idx] = s1; 312815091d37SBarry Smith } 312915091d37SBarry Smith /* backward solve the upper triangular */ 313015091d37SBarry Smith for (i=n-1; i>=0; i--){ 313115091d37SBarry Smith v = aa + diag[i] + 1; 313215091d37SBarry Smith vi = aj + diag[i] + 1; 313315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 313415091d37SBarry Smith idt = i; 3135f1af5d2fSBarry Smith s1 = x[idt]; 313615091d37SBarry Smith while (nz--) { 313715091d37SBarry Smith idx = *vi++; 313815091d37SBarry Smith x1 = x[idx]; 3139f1af5d2fSBarry Smith s1 -= v[0]*x1; 314015091d37SBarry Smith v += 1; 314115091d37SBarry Smith } 314215091d37SBarry Smith v = aa + diag[i]; 3143f1af5d2fSBarry Smith x[idt] = v[0]*s1; 314415091d37SBarry Smith } 31451ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 31461ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3147dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 314815091d37SBarry Smith PetscFunctionReturn(0); 314915091d37SBarry Smith } 31504e2b4712SSatish Balay 31514e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 31526bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 31536bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 31546bce7ff8SHong Zhang 315584a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 31566bce7ff8SHong Zhang #undef __FUNCT__ 31576bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 31586bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 31596bce7ff8SHong Zhang { 31606bce7ff8SHong Zhang Mat C=B; 31616bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 31626bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 31636bce7ff8SHong Zhang PetscErrorCode ierr; 31646bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 31656bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 31666bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3167914a18a2SHong Zhang MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3168914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3169914a18a2SHong Zhang MatScalar *v_work; 31706bce7ff8SHong Zhang 31716bce7ff8SHong Zhang PetscFunctionBegin; 31726bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 31736bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3174914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3175914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 31766bce7ff8SHong Zhang ics = ic; 31776bce7ff8SHong Zhang 3178914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 3179914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3180914a18a2SHong Zhang multiplier = v_work + bs; 3181914a18a2SHong Zhang v_pivots = (PetscInt*)(multiplier + bs2); 3182914a18a2SHong Zhang 31836bce7ff8SHong Zhang for (i=0; i<n; i++){ 31846bce7ff8SHong Zhang /* zero rtmp */ 31856bce7ff8SHong Zhang /* L part */ 31866bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 31876bce7ff8SHong Zhang bjtmp = bj + bi[i]; 3188914a18a2SHong Zhang for (j=0; j<nz; j++){ 3189914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3190914a18a2SHong Zhang } 31916bce7ff8SHong Zhang 31926bce7ff8SHong Zhang /* U part */ 31936bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 31946bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 3195914a18a2SHong Zhang for (j=0; j<nz; j++){ 3196914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3197914a18a2SHong Zhang } 31986bce7ff8SHong Zhang 31996bce7ff8SHong Zhang /* load in initial (unfactored row) */ 32006bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 32016bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 3202914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 32036bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3204914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 32056bce7ff8SHong Zhang } 32066bce7ff8SHong Zhang 32076bce7ff8SHong Zhang /* elimination */ 32086bce7ff8SHong Zhang bjtmp = bj + bi[i]; 32096bce7ff8SHong Zhang row = *bjtmp++; 32106bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 32116bce7ff8SHong Zhang k = 0; 32126bce7ff8SHong Zhang while (k < nzL) { 3213914a18a2SHong Zhang pc = rtmp + bs2*row; 3214914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3215914a18a2SHong Zhang if (flg) { 3216914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 3217914a18a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 32186bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3219914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 32206bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3221914a18a2SHong Zhang for (j=0; j<nz; j++) { 3222914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3223914a18a2SHong Zhang } 32246bce7ff8SHong Zhang ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 32256bce7ff8SHong Zhang } 32266bce7ff8SHong Zhang row = *bjtmp++; k++; 32276bce7ff8SHong Zhang } 32286bce7ff8SHong Zhang 32296bce7ff8SHong Zhang /* finished row so stick it into b->a */ 32306bce7ff8SHong Zhang /* L part */ 3231914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 32326bce7ff8SHong Zhang pj = b->j + bi[i] ; 32336bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 32346bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3235914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 32366bce7ff8SHong Zhang } 32376bce7ff8SHong Zhang 32386bce7ff8SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 3239914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 32406bce7ff8SHong Zhang pj = b->j + bdiag[i]; 3241914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3242914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3243914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 32446bce7ff8SHong Zhang 32456bce7ff8SHong Zhang /* U part */ 3246914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 32476bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 32486bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3249914a18a2SHong Zhang for (j=0; j<nz; j++){ 3250914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3251914a18a2SHong Zhang } 32526bce7ff8SHong Zhang } 32536bce7ff8SHong Zhang 32546bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 32556bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 32566bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 32576bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 325884a281e5SHong Zhang if (bs == 5){ 325984a281e5SHong Zhang C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 326084a281e5SHong Zhang } else { 326184a281e5SHong Zhang C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 326284a281e5SHong Zhang } 32636bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 3264914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 32656bce7ff8SHong Zhang PetscFunctionReturn(0); 32666bce7ff8SHong Zhang } 32676bce7ff8SHong Zhang 32686bce7ff8SHong Zhang /* 32696bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 32706bce7ff8SHong Zhang Factored arrays bj and ba are stored as 32716bce7ff8SHong Zhang L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 32726bce7ff8SHong Zhang 32736bce7ff8SHong Zhang bi=fact->i is an array of size 2n+2, in which 32746bce7ff8SHong Zhang bi+ 32756bce7ff8SHong Zhang bi[i] -> 1st entry of L(i,:),i=0,...,i-1 32766bce7ff8SHong Zhang bi[n] -> end of L(n-1,:)+1 32776bce7ff8SHong Zhang bi[n+1] -> 1st entry of U(n-1,:) 32786bce7ff8SHong Zhang bi[2n-i] -> 1st entry of U(i,:) 32796bce7ff8SHong Zhang bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 32806bce7ff8SHong Zhang bi[2n] -> end of U(0,:)+1 32816bce7ff8SHong Zhang 32826bce7ff8SHong Zhang U(i,:) contains diag[i] as its last entry, i.e., 32836bce7ff8SHong Zhang U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 32846bce7ff8SHong Zhang */ 32856bce7ff8SHong Zhang #undef __FUNCT__ 32866bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 32876bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 32886bce7ff8SHong Zhang { 32896bce7ff8SHong Zhang 32906bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 32916bce7ff8SHong Zhang PetscErrorCode ierr; 3292914a18a2SHong Zhang PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 32936bce7ff8SHong Zhang PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 32946bce7ff8SHong Zhang 32956bce7ff8SHong Zhang PetscFunctionBegin; 32966bce7ff8SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 32976bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 3298914a18a2SHong Zhang bdiag = b->diag; 32996bce7ff8SHong Zhang 33006bce7ff8SHong Zhang /* replace matrix arrays with single allocations, then reset values */ 33016bce7ff8SHong Zhang ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 33026bce7ff8SHong Zhang 33036bce7ff8SHong Zhang ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 33046bce7ff8SHong Zhang ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 33056bce7ff8SHong Zhang ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 33066bce7ff8SHong Zhang b->singlemalloc = PETSC_FALSE; 33076bce7ff8SHong Zhang if (mbs > 0) { 33086bce7ff8SHong Zhang ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 33096bce7ff8SHong Zhang } 33106bce7ff8SHong Zhang 33116bce7ff8SHong Zhang /* set bi and bj with new data structure */ 33126bce7ff8SHong Zhang bi = b->i; 33136bce7ff8SHong Zhang bj = b->j; 33146bce7ff8SHong Zhang 33156bce7ff8SHong Zhang /* L part */ 33166bce7ff8SHong Zhang bi[0] = 0; 33176bce7ff8SHong Zhang for (i=0; i<mbs; i++){ 33186bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 3319914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 33206bce7ff8SHong Zhang aj = a->j + ai[i]; 33216bce7ff8SHong Zhang for (j=0; j<nz; j++){ 33226bce7ff8SHong Zhang *bj = aj[j]; bj++; 33236bce7ff8SHong Zhang } 33246bce7ff8SHong Zhang } 33256bce7ff8SHong Zhang 33266bce7ff8SHong Zhang /* U part */ 33276bce7ff8SHong Zhang bi[mbs+1] = bi[mbs]; 33286bce7ff8SHong Zhang for (i=mbs-1; i>=0; i--){ 33296bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 33306bce7ff8SHong Zhang if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3331914a18a2SHong Zhang bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 33326bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 33336bce7ff8SHong Zhang for (j=0; j<nz; j++){ 33346bce7ff8SHong Zhang *bj = aj[j]; bj++; 33356bce7ff8SHong Zhang } 33366bce7ff8SHong Zhang /* diag[i] */ 33376bce7ff8SHong Zhang *bj = i; bj++; 33386bce7ff8SHong Zhang bdiag[i] = bi[2*mbs-i+1]-1; 33396bce7ff8SHong Zhang } 33406bce7ff8SHong Zhang PetscFunctionReturn(0); 33416bce7ff8SHong Zhang } 33426bce7ff8SHong Zhang 33434e2b4712SSatish Balay /* 33444e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 33454e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 33464e2b4712SSatish Balay Not a good example of code reuse. 33474e2b4712SSatish Balay */ 3348435faa5fSBarry Smith 33494a2ae208SSatish Balay #undef __FUNCT__ 33504a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 33510481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 33524e2b4712SSatish Balay { 33534e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 33544e2b4712SSatish Balay IS isicol; 33556849ba73SBarry Smith PetscErrorCode ierr; 33565d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 33575d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3358a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3359d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 336041df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 3361329f5518SBarry Smith PetscReal f; 33624e2b4712SSatish Balay 33634e2b4712SSatish Balay PetscFunctionBegin; 33646bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 33656bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 33666bce7ff8SHong Zhang 3367435faa5fSBarry Smith f = info->fill; 3368690b6cddSBarry Smith levels = (PetscInt)info->levels; 3369690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 33704c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3371667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3372667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 33737d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 3374309c388cSBarry Smith 337541df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 33766bce7ff8SHong Zhang 33776bce7ff8SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 33786bce7ff8SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 33796bce7ff8SHong Zhang if (newdatastruct){ 33806bce7ff8SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 33816bce7ff8SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 33826bce7ff8SHong Zhang } else { 3383719d5645SBarry Smith ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 33846bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 33856bce7ff8SHong Zhang } 33866bce7ff8SHong Zhang 3387719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 3388719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3389bb3d539aSBarry Smith b->row = isrow; 3390bb3d539aSBarry Smith b->col = iscol; 3391bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3392bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3393bb3d539aSBarry Smith b->icol = isicol; 3394bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3395719d5645SBarry Smith ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 33966bce7ff8SHong Zhang PetscFunctionReturn(0); 33976bce7ff8SHong Zhang } 33986bce7ff8SHong Zhang 33996bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 34004e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 34014e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 34024e2b4712SSatish Balay 34034e2b4712SSatish Balay /* get new row pointers */ 3404690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 34054e2b4712SSatish Balay ainew[0] = 0; 34064e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 3407690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 3408690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 34094e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 3410690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 34114e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3412690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 34134e2b4712SSatish Balay /* im is level for each filled value */ 3414690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 34154e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3416690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 34174e2b4712SSatish Balay dloc[0] = 0; 34184e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3419435faa5fSBarry Smith 3420435faa5fSBarry Smith /* copy prow into linked list */ 34214e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 34223b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 34234e2b4712SSatish Balay xi = aj + ai[r[prow]]; 34244e2b4712SSatish Balay fill[n] = n; 3425435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 34264e2b4712SSatish Balay while (nz--) { 34274e2b4712SSatish Balay fm = n; 34284e2b4712SSatish Balay idx = ic[*xi++]; 34294e2b4712SSatish Balay do { 34304e2b4712SSatish Balay m = fm; 34314e2b4712SSatish Balay fm = fill[m]; 34324e2b4712SSatish Balay } while (fm < idx); 34334e2b4712SSatish Balay fill[m] = idx; 34344e2b4712SSatish Balay fill[idx] = fm; 34354e2b4712SSatish Balay im[idx] = 0; 34364e2b4712SSatish Balay } 3437435faa5fSBarry Smith 3438435faa5fSBarry Smith /* make sure diagonal entry is included */ 3439435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3440435faa5fSBarry Smith fm = n; 3441435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3442435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3443435faa5fSBarry Smith fill[fm] = prow; 3444435faa5fSBarry Smith im[prow] = 0; 3445435faa5fSBarry Smith nzf++; 3446335d9088SBarry Smith dcount++; 3447435faa5fSBarry Smith } 3448435faa5fSBarry Smith 34494e2b4712SSatish Balay nzi = 0; 34504e2b4712SSatish Balay row = fill[n]; 34514e2b4712SSatish Balay while (row < prow) { 34524e2b4712SSatish Balay incrlev = im[row] + 1; 34534e2b4712SSatish Balay nz = dloc[row]; 3454435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 34554e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 34564e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 34574e2b4712SSatish Balay fm = row; 34584e2b4712SSatish Balay while (nnz-- > 0) { 34594e2b4712SSatish Balay idx = *xi++; 34604e2b4712SSatish Balay if (*flev + incrlev > levels) { 34614e2b4712SSatish Balay flev++; 34624e2b4712SSatish Balay continue; 34634e2b4712SSatish Balay } 34644e2b4712SSatish Balay do { 34654e2b4712SSatish Balay m = fm; 34664e2b4712SSatish Balay fm = fill[m]; 34674e2b4712SSatish Balay } while (fm < idx); 34684e2b4712SSatish Balay if (fm != idx) { 34694e2b4712SSatish Balay im[idx] = *flev + incrlev; 34704e2b4712SSatish Balay fill[m] = idx; 34714e2b4712SSatish Balay fill[idx] = fm; 34724e2b4712SSatish Balay fm = idx; 34734e2b4712SSatish Balay nzf++; 3474ecf371e4SBarry Smith } else { 34754e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 34764e2b4712SSatish Balay } 34774e2b4712SSatish Balay flev++; 34784e2b4712SSatish Balay } 34794e2b4712SSatish Balay row = fill[row]; 34804e2b4712SSatish Balay nzi++; 34814e2b4712SSatish Balay } 34824e2b4712SSatish Balay /* copy new filled row into permanent storage */ 34834e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 34844e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3485ecf371e4SBarry Smith 3486ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3487ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3488ecf371e4SBarry Smith /* just double the memory each time */ 3489690b6cddSBarry Smith PetscInt maxadd = jmax; 3490ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 34914e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 34924e2b4712SSatish Balay jmax += maxadd; 3493ecf371e4SBarry Smith 3494ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 34955d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 34965d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3497606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 34985d0c19d7SBarry Smith ajnew = xitmp; 34995d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 35005d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3501606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 35025d0c19d7SBarry Smith ajfill = xitmp; 3503eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 35044e2b4712SSatish Balay } 35055d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 35064e2b4712SSatish Balay flev = ajfill + ainew[prow]; 35074e2b4712SSatish Balay dloc[prow] = nzi; 35084e2b4712SSatish Balay fm = fill[n]; 35094e2b4712SSatish Balay while (nzf--) { 35105d0c19d7SBarry Smith *xitmp++ = fm; 35114e2b4712SSatish Balay *flev++ = im[fm]; 35124e2b4712SSatish Balay fm = fill[fm]; 35134e2b4712SSatish Balay } 3514435faa5fSBarry Smith /* make sure row has diagonal entry */ 3515435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 351677431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 35172401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3518435faa5fSBarry Smith } 35194e2b4712SSatish Balay } 3520606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 35214e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 35224e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3523606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3524606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 35254e2b4712SSatish Balay 35266cf91177SBarry Smith #if defined(PETSC_USE_INFO) 35274e2b4712SSatish Balay { 3528329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3529ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3530ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3531ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3532ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3533335d9088SBarry Smith if (diagonal_fill) { 3534ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3535335d9088SBarry Smith } 35364e2b4712SSatish Balay } 353763ba0a88SBarry Smith #endif 35384e2b4712SSatish Balay 35394e2b4712SSatish Balay /* put together the new matrix */ 3540719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3541719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3542719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3543e6b907acSBarry Smith b->free_a = PETSC_TRUE; 3544e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 35457c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 3546a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 35474e2b4712SSatish Balay b->j = ajnew; 35484e2b4712SSatish Balay b->i = ainew; 35494e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 35504e2b4712SSatish Balay b->diag = dloc; 35514e2b4712SSatish Balay b->ilen = 0; 35524e2b4712SSatish Balay b->imax = 0; 35534e2b4712SSatish Balay b->row = isrow; 35544e2b4712SSatish Balay b->col = iscol; 3555bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3556c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3557c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3558e51c0b9cSSatish Balay b->icol = isicol; 355987828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 35604e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 35614e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 3562719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 35634e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 35644e2b4712SSatish Balay 3565719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 3566719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 3567719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 35686bce7ff8SHong Zhang 356941df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 35708661488fSKris Buschelman PetscFunctionReturn(0); 35718661488fSKris Buschelman } 35728661488fSKris Buschelman 3573732ee342SKris Buschelman #undef __FUNCT__ 35747e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3575dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 35767e7071cdSKris Buschelman { 357712272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 357812272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 35795a9542e3SKris Buschelman PetscFunctionBegin; 35807cf1b8d3SKris Buschelman /* Undo Column scaling */ 35817cf1b8d3SKris Buschelman /* while (nz--) { */ 35827cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 35837cf1b8d3SKris Buschelman /* } */ 3584c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 3585c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 35867cf1b8d3SKris Buschelman PetscFunctionReturn(0); 35877cf1b8d3SKris Buschelman } 35887cf1b8d3SKris Buschelman 35897cf1b8d3SKris Buschelman #undef __FUNCT__ 35907cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3591dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 35927cf1b8d3SKris Buschelman { 35937cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3594b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 35952aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 35965a9542e3SKris Buschelman PetscFunctionBegin; 35970b9da03eSKris Buschelman /* Is this really necessary? */ 359820235379SKris Buschelman while (nz--) { 35990b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 36007e7071cdSKris Buschelman } 3601c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 36027e7071cdSKris Buschelman PetscFunctionReturn(0); 36037e7071cdSKris Buschelman } 36047e7071cdSKris Buschelman 3605732ee342SKris Buschelman 3606