1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14f1af5d2fSBarry Smith { 15f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16dfbe8321SBarry Smith PetscErrorCode ierr; 17690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18690b6cddSBarry Smith PetscInt *diag = a->diag; 19f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2087828ca2SBarry Smith PetscScalar s1,*x,*b; 21f1af5d2fSBarry Smith 22f1af5d2fSBarry Smith PetscFunctionBegin; 23ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26f1af5d2fSBarry Smith 27f1af5d2fSBarry Smith /* forward solve the U^T */ 28f1af5d2fSBarry Smith for (i=0; i<n; i++) { 29f1af5d2fSBarry Smith 30f1af5d2fSBarry Smith v = aa + diag[i]; 31f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 32ef66eb69SBarry Smith s1 = (*v++)*x[i]; 33f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 34f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 35f1af5d2fSBarry Smith while (nz--) { 36f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 37f1af5d2fSBarry Smith } 38f1af5d2fSBarry Smith x[i] = s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith /* backward solve the L^T */ 41f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 42f1af5d2fSBarry Smith v = aa + diag[i] - 1; 43f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 44f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 45f1af5d2fSBarry Smith s1 = x[i]; 46f1af5d2fSBarry Smith while (nz--) { 47f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 48f1af5d2fSBarry Smith } 49f1af5d2fSBarry Smith } 501ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53f1af5d2fSBarry Smith PetscFunctionReturn(0); 54f1af5d2fSBarry Smith } 55f1af5d2fSBarry Smith 56*27019359SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 57*27019359SHong Zhang { 58*27019359SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 59*27019359SHong Zhang PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60*27019359SHong Zhang PetscErrorCode ierr; 61*27019359SHong Zhang PetscInt jdx; 62*27019359SHong Zhang const MatScalar *aa=a->a,*v; 63*27019359SHong Zhang PetscScalar *x,s1,s2,x1,x2; 64*27019359SHong Zhang const PetscScalar *b; 65*27019359SHong Zhang 66*27019359SHong Zhang PetscFunctionBegin; 67*27019359SHong Zhang ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 68*27019359SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 69*27019359SHong Zhang /* forward solve the lower triangular */ 70*27019359SHong Zhang idx = 0; 71*27019359SHong Zhang x[0] = b[idx]; x[1] = b[1+idx]; 72*27019359SHong Zhang for (i=1; i<n; i++) { 73*27019359SHong Zhang v = aa + 4*ai[i]; 74*27019359SHong Zhang vi = aj + ai[i]; 75*27019359SHong Zhang nz = ai[i+1] - ai[i]; 76*27019359SHong Zhang idx = 2*i; 77*27019359SHong Zhang s1 = b[idx];s2 = b[1+idx]; 78*27019359SHong Zhang while (nz--) { 79*27019359SHong Zhang jdx = 2*(*vi++); 80*27019359SHong Zhang x1 = x[jdx];x2 = x[1+jdx]; 81*27019359SHong Zhang s1 -= v[0]*x1 + v[2]*x2; 82*27019359SHong Zhang s2 -= v[1]*x1 + v[3]*x2; 83*27019359SHong Zhang v += 4; 84*27019359SHong Zhang } 85*27019359SHong Zhang x[idx] = s1; 86*27019359SHong Zhang x[1+idx] = s2; 87*27019359SHong Zhang } 88*27019359SHong Zhang 89*27019359SHong Zhang /* backward solve the upper triangular */ 90*27019359SHong Zhang for (i=n-1; i>=0; i--){ 91*27019359SHong Zhang v = aa + 4*ai[2*n-i]; 92*27019359SHong Zhang vi = aj + ai[2*n-i]; 93*27019359SHong Zhang nz = ai[2*n-i +1] - ai[2*n-i]-1; 94*27019359SHong Zhang idt = 2*i; 95*27019359SHong Zhang s1 = x[idt]; s2 = x[1+idt]; 96*27019359SHong Zhang while (nz--) { 97*27019359SHong Zhang idx = 2*(*vi++); 98*27019359SHong Zhang x1 = x[idx]; x2 = x[1+idx]; 99*27019359SHong Zhang s1 -= v[0]*x1 + v[2]*x2; 100*27019359SHong Zhang s2 -= v[1]*x1 + v[3]*x2; 101*27019359SHong Zhang v += 4; 102*27019359SHong Zhang } 103*27019359SHong Zhang /* x = inv_diagonal*x */ 104*27019359SHong Zhang x[idt] = v[0]*s1 + v[2]*s2; 105*27019359SHong Zhang x[1+idt] = v[1]*s1 + v[3]*s2; 106*27019359SHong Zhang } 107*27019359SHong Zhang 108*27019359SHong Zhang ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 109*27019359SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 110*27019359SHong Zhang ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 111*27019359SHong Zhang PetscFunctionReturn(0); 112*27019359SHong Zhang } 113*27019359SHong Zhang 1144a2ae208SSatish Balay #undef __FUNCT__ 1154a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 116dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 117f1af5d2fSBarry Smith { 118f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119dfbe8321SBarry Smith PetscErrorCode ierr; 120690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 121690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 122f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 12387828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 12487828ca2SBarry Smith PetscScalar *x,*b; 125f1af5d2fSBarry Smith 126f1af5d2fSBarry Smith PetscFunctionBegin; 127ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1281ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 130f1af5d2fSBarry Smith 131f1af5d2fSBarry Smith /* forward solve the U^T */ 132f1af5d2fSBarry Smith idx = 0; 133f1af5d2fSBarry Smith for (i=0; i<n; i++) { 134f1af5d2fSBarry Smith 135f1af5d2fSBarry Smith v = aa + 4*diag[i]; 136f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 137ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 138f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 139f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 140f1af5d2fSBarry Smith v += 4; 141f1af5d2fSBarry Smith 142f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 143f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 144f1af5d2fSBarry Smith while (nz--) { 145f1af5d2fSBarry Smith oidx = 2*(*vi++); 146f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 147f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 148f1af5d2fSBarry Smith v += 4; 149f1af5d2fSBarry Smith } 150f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 151f1af5d2fSBarry Smith idx += 2; 152f1af5d2fSBarry Smith } 153f1af5d2fSBarry Smith /* backward solve the L^T */ 154f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 155f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 156f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 157f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 158f1af5d2fSBarry Smith idt = 2*i; 159f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 160f1af5d2fSBarry Smith while (nz--) { 161f1af5d2fSBarry Smith idx = 2*(*vi--); 162f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 163f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 164f1af5d2fSBarry Smith v -= 4; 165f1af5d2fSBarry Smith } 166f1af5d2fSBarry Smith } 1671ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1681ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 170f1af5d2fSBarry Smith PetscFunctionReturn(0); 171f1af5d2fSBarry Smith } 172f1af5d2fSBarry Smith 1734a2ae208SSatish Balay #undef __FUNCT__ 1744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 175dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 176f1af5d2fSBarry Smith { 177f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 178dfbe8321SBarry Smith PetscErrorCode ierr; 179690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 181f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18287828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 18387828ca2SBarry Smith PetscScalar *x,*b; 184f1af5d2fSBarry Smith 185f1af5d2fSBarry Smith PetscFunctionBegin; 186ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1871ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189f1af5d2fSBarry Smith 190f1af5d2fSBarry Smith /* forward solve the U^T */ 191f1af5d2fSBarry Smith idx = 0; 192f1af5d2fSBarry Smith for (i=0; i<n; i++) { 193f1af5d2fSBarry Smith 194f1af5d2fSBarry Smith v = aa + 9*diag[i]; 195f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 196ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 197f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 198f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 199f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 200f1af5d2fSBarry Smith v += 9; 201f1af5d2fSBarry Smith 202f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 203f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 204f1af5d2fSBarry Smith while (nz--) { 205f1af5d2fSBarry Smith oidx = 3*(*vi++); 206f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 207f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 208f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 209f1af5d2fSBarry Smith v += 9; 210f1af5d2fSBarry Smith } 211f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 212f1af5d2fSBarry Smith idx += 3; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith /* backward solve the L^T */ 215f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 216f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 217f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 218f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 219f1af5d2fSBarry Smith idt = 3*i; 220f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 221f1af5d2fSBarry Smith while (nz--) { 222f1af5d2fSBarry Smith idx = 3*(*vi--); 223f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 224f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 225f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 226f1af5d2fSBarry Smith v -= 9; 227f1af5d2fSBarry Smith } 228f1af5d2fSBarry Smith } 2291ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2301ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 231dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 232f1af5d2fSBarry Smith PetscFunctionReturn(0); 233f1af5d2fSBarry Smith } 234f1af5d2fSBarry Smith 2354a2ae208SSatish Balay #undef __FUNCT__ 2364a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 237dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 238f1af5d2fSBarry Smith { 239f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 240dfbe8321SBarry Smith PetscErrorCode ierr; 241690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 242690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 243f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 24487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 24587828ca2SBarry Smith PetscScalar *x,*b; 246f1af5d2fSBarry Smith 247f1af5d2fSBarry Smith PetscFunctionBegin; 248ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2491ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2501ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251f1af5d2fSBarry Smith 252f1af5d2fSBarry Smith /* forward solve the U^T */ 253f1af5d2fSBarry Smith idx = 0; 254f1af5d2fSBarry Smith for (i=0; i<n; i++) { 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith v = aa + 16*diag[i]; 257f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 258ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 259f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 260f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 261f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 262f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 263f1af5d2fSBarry Smith v += 16; 264f1af5d2fSBarry Smith 265f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 266f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 267f1af5d2fSBarry Smith while (nz--) { 268f1af5d2fSBarry Smith oidx = 4*(*vi++); 269f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 270f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 271f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 272f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 273f1af5d2fSBarry Smith v += 16; 274f1af5d2fSBarry Smith } 275f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 276f1af5d2fSBarry Smith idx += 4; 277f1af5d2fSBarry Smith } 278f1af5d2fSBarry Smith /* backward solve the L^T */ 279f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 280f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 281f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 282f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 283f1af5d2fSBarry Smith idt = 4*i; 284f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 285f1af5d2fSBarry Smith while (nz--) { 286f1af5d2fSBarry Smith idx = 4*(*vi--); 287f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 288f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 289f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 290f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 291f1af5d2fSBarry Smith v -= 16; 292f1af5d2fSBarry Smith } 293f1af5d2fSBarry Smith } 2941ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 297f1af5d2fSBarry Smith PetscFunctionReturn(0); 298f1af5d2fSBarry Smith } 299f1af5d2fSBarry Smith 3004a2ae208SSatish Balay #undef __FUNCT__ 3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 302dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305dfbe8321SBarry Smith PetscErrorCode ierr; 306690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 308f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 30987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 31087828ca2SBarry Smith PetscScalar *x,*b; 311f1af5d2fSBarry Smith 312f1af5d2fSBarry Smith PetscFunctionBegin; 313ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3141ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316f1af5d2fSBarry Smith 317f1af5d2fSBarry Smith /* forward solve the U^T */ 318f1af5d2fSBarry Smith idx = 0; 319f1af5d2fSBarry Smith for (i=0; i<n; i++) { 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith v = aa + 25*diag[i]; 322f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 323ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 324f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 325f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 326f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 327f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 328f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 329f1af5d2fSBarry Smith v += 25; 330f1af5d2fSBarry Smith 331f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 332f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 333f1af5d2fSBarry Smith while (nz--) { 334f1af5d2fSBarry Smith oidx = 5*(*vi++); 335f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 336f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 337f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 338f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 339f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 340f1af5d2fSBarry Smith v += 25; 341f1af5d2fSBarry Smith } 342f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 343f1af5d2fSBarry Smith idx += 5; 344f1af5d2fSBarry Smith } 345f1af5d2fSBarry Smith /* backward solve the L^T */ 346f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 347f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 348f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 349f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 350f1af5d2fSBarry Smith idt = 5*i; 351f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 352f1af5d2fSBarry Smith while (nz--) { 353f1af5d2fSBarry Smith idx = 5*(*vi--); 354f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 355f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 356f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 357f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 358f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 359f1af5d2fSBarry Smith v -= 25; 360f1af5d2fSBarry Smith } 361f1af5d2fSBarry Smith } 3621ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 364dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 365f1af5d2fSBarry Smith PetscFunctionReturn(0); 366f1af5d2fSBarry Smith } 367f1af5d2fSBarry Smith 3684a2ae208SSatish Balay #undef __FUNCT__ 3694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 370dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 371f1af5d2fSBarry Smith { 372f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 373dfbe8321SBarry Smith PetscErrorCode ierr; 374690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 375690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 376f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 37787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 37887828ca2SBarry Smith PetscScalar *x,*b; 379f1af5d2fSBarry Smith 380f1af5d2fSBarry Smith PetscFunctionBegin; 381ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 384f1af5d2fSBarry Smith 385f1af5d2fSBarry Smith /* forward solve the U^T */ 386f1af5d2fSBarry Smith idx = 0; 387f1af5d2fSBarry Smith for (i=0; i<n; i++) { 388f1af5d2fSBarry Smith 389f1af5d2fSBarry Smith v = aa + 36*diag[i]; 390f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 391ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 392ef66eb69SBarry Smith x6 = x[5+idx]; 393f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 394f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 395f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 396f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 397f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 398f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 399f1af5d2fSBarry Smith v += 36; 400f1af5d2fSBarry Smith 401f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 402f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 403f1af5d2fSBarry Smith while (nz--) { 404f1af5d2fSBarry Smith oidx = 6*(*vi++); 405f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 406f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 407f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 408f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 409f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 410f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 411f1af5d2fSBarry Smith v += 36; 412f1af5d2fSBarry Smith } 413f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 414f1af5d2fSBarry Smith x[5+idx] = s6; 415f1af5d2fSBarry Smith idx += 6; 416f1af5d2fSBarry Smith } 417f1af5d2fSBarry Smith /* backward solve the L^T */ 418f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 419f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 420f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 421f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 422f1af5d2fSBarry Smith idt = 6*i; 423f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 424f1af5d2fSBarry Smith s6 = x[5+idt]; 425f1af5d2fSBarry Smith while (nz--) { 426f1af5d2fSBarry Smith idx = 6*(*vi--); 427f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 428f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 429f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 430f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 431f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 432f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 433f1af5d2fSBarry Smith v -= 36; 434f1af5d2fSBarry Smith } 435f1af5d2fSBarry Smith } 4361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 438dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 439f1af5d2fSBarry Smith PetscFunctionReturn(0); 440f1af5d2fSBarry Smith } 441f1af5d2fSBarry Smith 4424a2ae208SSatish Balay #undef __FUNCT__ 4434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 444dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 445f1af5d2fSBarry Smith { 446f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 447dfbe8321SBarry Smith PetscErrorCode ierr; 448690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 449690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 450f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 45187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 45287828ca2SBarry Smith PetscScalar *x,*b; 453f1af5d2fSBarry Smith 454f1af5d2fSBarry Smith PetscFunctionBegin; 455ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 458f1af5d2fSBarry Smith 459f1af5d2fSBarry Smith /* forward solve the U^T */ 460f1af5d2fSBarry Smith idx = 0; 461f1af5d2fSBarry Smith for (i=0; i<n; i++) { 462f1af5d2fSBarry Smith 463f1af5d2fSBarry Smith v = aa + 49*diag[i]; 464f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 465ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 466ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 467f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 468f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 469f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 470f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 471f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 472f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 473f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 474f1af5d2fSBarry Smith v += 49; 475f1af5d2fSBarry Smith 476f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 477f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 478f1af5d2fSBarry Smith while (nz--) { 479f1af5d2fSBarry Smith oidx = 7*(*vi++); 480f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 481f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 482f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 483f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 484f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 485f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 486f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 487f1af5d2fSBarry Smith v += 49; 488f1af5d2fSBarry Smith } 489f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 490f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 491f1af5d2fSBarry Smith idx += 7; 492f1af5d2fSBarry Smith } 493f1af5d2fSBarry Smith /* backward solve the L^T */ 494f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 495f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 496f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 497f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 498f1af5d2fSBarry Smith idt = 7*i; 499f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 500f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 501f1af5d2fSBarry Smith while (nz--) { 502f1af5d2fSBarry Smith idx = 7*(*vi--); 503f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 504f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 505f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 506f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 507f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 508f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 509f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 510f1af5d2fSBarry Smith v -= 49; 511f1af5d2fSBarry Smith } 512f1af5d2fSBarry Smith } 5131ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 515dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 516f1af5d2fSBarry Smith PetscFunctionReturn(0); 517f1af5d2fSBarry Smith } 518f1af5d2fSBarry Smith 519f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 5204a2ae208SSatish Balay #undef __FUNCT__ 5214a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 522dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 523f1af5d2fSBarry Smith { 524f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 525f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5266849ba73SBarry Smith PetscErrorCode ierr; 5275d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5285d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 529690b6cddSBarry Smith PetscInt *diag = a->diag; 530f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 53187828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 532f1af5d2fSBarry Smith 533f1af5d2fSBarry Smith PetscFunctionBegin; 5341ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 536f1af5d2fSBarry Smith t = a->solve_work; 537f1af5d2fSBarry Smith 538f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 539f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 540f1af5d2fSBarry Smith 541f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 542f1af5d2fSBarry Smith for (i=0; i<n; i++) { 543f1af5d2fSBarry Smith t[i] = b[c[i]]; 544f1af5d2fSBarry Smith } 545f1af5d2fSBarry Smith 546f1af5d2fSBarry Smith /* forward solve the U^T */ 547f1af5d2fSBarry Smith for (i=0; i<n; i++) { 548f1af5d2fSBarry Smith 549f1af5d2fSBarry Smith v = aa + diag[i]; 550f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 551f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 552f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 553f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 554f1af5d2fSBarry Smith while (nz--) { 555f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 556f1af5d2fSBarry Smith } 557f1af5d2fSBarry Smith t[i] = s1; 558f1af5d2fSBarry Smith } 559f1af5d2fSBarry Smith /* backward solve the L^T */ 560f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 561f1af5d2fSBarry Smith v = aa + diag[i] - 1; 562f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 563f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 564f1af5d2fSBarry Smith s1 = t[i]; 565f1af5d2fSBarry Smith while (nz--) { 566f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 567f1af5d2fSBarry Smith } 568f1af5d2fSBarry Smith } 569f1af5d2fSBarry Smith 570f1af5d2fSBarry Smith /* copy t into x according to permutation */ 571f1af5d2fSBarry Smith for (i=0; i<n; i++) { 572f1af5d2fSBarry Smith x[r[i]] = t[i]; 573f1af5d2fSBarry Smith } 574f1af5d2fSBarry Smith 575f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 576f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5771ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 579dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 580f1af5d2fSBarry Smith PetscFunctionReturn(0); 581f1af5d2fSBarry Smith } 582f1af5d2fSBarry Smith 5834a2ae208SSatish Balay #undef __FUNCT__ 5844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 585dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 586f1af5d2fSBarry Smith { 587f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 588f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 5896849ba73SBarry Smith PetscErrorCode ierr; 5905d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 5915d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 592690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 593f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 59487828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 59587828ca2SBarry Smith PetscScalar *x,*b,*t; 596f1af5d2fSBarry Smith 597f1af5d2fSBarry Smith PetscFunctionBegin; 5981ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5991ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600f1af5d2fSBarry Smith t = a->solve_work; 601f1af5d2fSBarry Smith 602f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 603f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 604f1af5d2fSBarry Smith 605f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 606f1af5d2fSBarry Smith ii = 0; 607f1af5d2fSBarry Smith for (i=0; i<n; i++) { 608f1af5d2fSBarry Smith ic = 2*c[i]; 609f1af5d2fSBarry Smith t[ii] = b[ic]; 610f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 611f1af5d2fSBarry Smith ii += 2; 612f1af5d2fSBarry Smith } 613f1af5d2fSBarry Smith 614f1af5d2fSBarry Smith /* forward solve the U^T */ 615f1af5d2fSBarry Smith idx = 0; 616f1af5d2fSBarry Smith for (i=0; i<n; i++) { 617f1af5d2fSBarry Smith 618f1af5d2fSBarry Smith v = aa + 4*diag[i]; 619f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 620f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 621f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 622f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 623f1af5d2fSBarry Smith v += 4; 624f1af5d2fSBarry Smith 625f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 626f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 627f1af5d2fSBarry Smith while (nz--) { 628f1af5d2fSBarry Smith oidx = 2*(*vi++); 629f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 630f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 631f1af5d2fSBarry Smith v += 4; 632f1af5d2fSBarry Smith } 633f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 634f1af5d2fSBarry Smith idx += 2; 635f1af5d2fSBarry Smith } 636f1af5d2fSBarry Smith /* backward solve the L^T */ 637f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 638f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 639f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 640f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 641f1af5d2fSBarry Smith idt = 2*i; 642f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 643f1af5d2fSBarry Smith while (nz--) { 644f1af5d2fSBarry Smith idx = 2*(*vi--); 645f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 646f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 647f1af5d2fSBarry Smith v -= 4; 648f1af5d2fSBarry Smith } 649f1af5d2fSBarry Smith } 650f1af5d2fSBarry Smith 651f1af5d2fSBarry Smith /* copy t into x according to permutation */ 652f1af5d2fSBarry Smith ii = 0; 653f1af5d2fSBarry Smith for (i=0; i<n; i++) { 654f1af5d2fSBarry Smith ir = 2*r[i]; 655f1af5d2fSBarry Smith x[ir] = t[ii]; 656f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 657f1af5d2fSBarry Smith ii += 2; 658f1af5d2fSBarry Smith } 659f1af5d2fSBarry Smith 660f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 661f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 6621ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 664dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 665f1af5d2fSBarry Smith PetscFunctionReturn(0); 666f1af5d2fSBarry Smith } 667f1af5d2fSBarry Smith 6684a2ae208SSatish Balay #undef __FUNCT__ 6694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 670dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 671f1af5d2fSBarry Smith { 672f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 673f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 6746849ba73SBarry Smith PetscErrorCode ierr; 6755d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 6765d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 677690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 678f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 67987828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 68087828ca2SBarry Smith PetscScalar *x,*b,*t; 681f1af5d2fSBarry Smith 682f1af5d2fSBarry Smith PetscFunctionBegin; 6831ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 6841ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 685f1af5d2fSBarry Smith t = a->solve_work; 686f1af5d2fSBarry Smith 687f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 688f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 689f1af5d2fSBarry Smith 690f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 691f1af5d2fSBarry Smith ii = 0; 692f1af5d2fSBarry Smith for (i=0; i<n; i++) { 693f1af5d2fSBarry Smith ic = 3*c[i]; 694f1af5d2fSBarry Smith t[ii] = b[ic]; 695f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 696f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 697f1af5d2fSBarry Smith ii += 3; 698f1af5d2fSBarry Smith } 699f1af5d2fSBarry Smith 700f1af5d2fSBarry Smith /* forward solve the U^T */ 701f1af5d2fSBarry Smith idx = 0; 702f1af5d2fSBarry Smith for (i=0; i<n; i++) { 703f1af5d2fSBarry Smith 704f1af5d2fSBarry Smith v = aa + 9*diag[i]; 705f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 706f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 707f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 708f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 709f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 710f1af5d2fSBarry Smith v += 9; 711f1af5d2fSBarry Smith 712f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 713f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 714f1af5d2fSBarry Smith while (nz--) { 715f1af5d2fSBarry Smith oidx = 3*(*vi++); 716f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 717f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 718f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 719f1af5d2fSBarry Smith v += 9; 720f1af5d2fSBarry Smith } 721f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 722f1af5d2fSBarry Smith idx += 3; 723f1af5d2fSBarry Smith } 724f1af5d2fSBarry Smith /* backward solve the L^T */ 725f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 726f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 727f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 728f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 729f1af5d2fSBarry Smith idt = 3*i; 730f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 731f1af5d2fSBarry Smith while (nz--) { 732f1af5d2fSBarry Smith idx = 3*(*vi--); 733f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 734f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 735f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 736f1af5d2fSBarry Smith v -= 9; 737f1af5d2fSBarry Smith } 738f1af5d2fSBarry Smith } 739f1af5d2fSBarry Smith 740f1af5d2fSBarry Smith /* copy t into x according to permutation */ 741f1af5d2fSBarry Smith ii = 0; 742f1af5d2fSBarry Smith for (i=0; i<n; i++) { 743f1af5d2fSBarry Smith ir = 3*r[i]; 744f1af5d2fSBarry Smith x[ir] = t[ii]; 745f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 746f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 747f1af5d2fSBarry Smith ii += 3; 748f1af5d2fSBarry Smith } 749f1af5d2fSBarry Smith 750f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 751f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 7521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 754dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 755f1af5d2fSBarry Smith PetscFunctionReturn(0); 756f1af5d2fSBarry Smith } 757f1af5d2fSBarry Smith 7584a2ae208SSatish Balay #undef __FUNCT__ 7594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 760dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 761f1af5d2fSBarry Smith { 762f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 763f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 7646849ba73SBarry Smith PetscErrorCode ierr; 7655d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 7665d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 767690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 768f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 76987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 77087828ca2SBarry Smith PetscScalar *x,*b,*t; 771f1af5d2fSBarry Smith 772f1af5d2fSBarry Smith PetscFunctionBegin; 7731ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 775f1af5d2fSBarry Smith t = a->solve_work; 776f1af5d2fSBarry Smith 777f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 778f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 779f1af5d2fSBarry Smith 780f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 781f1af5d2fSBarry Smith ii = 0; 782f1af5d2fSBarry Smith for (i=0; i<n; i++) { 783f1af5d2fSBarry Smith ic = 4*c[i]; 784f1af5d2fSBarry Smith t[ii] = b[ic]; 785f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 786f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 787f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 788f1af5d2fSBarry Smith ii += 4; 789f1af5d2fSBarry Smith } 790f1af5d2fSBarry Smith 791f1af5d2fSBarry Smith /* forward solve the U^T */ 792f1af5d2fSBarry Smith idx = 0; 793f1af5d2fSBarry Smith for (i=0; i<n; i++) { 794f1af5d2fSBarry Smith 795f1af5d2fSBarry Smith v = aa + 16*diag[i]; 796f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 797f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 798f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 799f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 800f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 801f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 802f1af5d2fSBarry Smith v += 16; 803f1af5d2fSBarry Smith 804f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 805f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 806f1af5d2fSBarry Smith while (nz--) { 807f1af5d2fSBarry Smith oidx = 4*(*vi++); 808f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 809f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 810f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 811f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 812f1af5d2fSBarry Smith v += 16; 813f1af5d2fSBarry Smith } 814f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 815f1af5d2fSBarry Smith idx += 4; 816f1af5d2fSBarry Smith } 817f1af5d2fSBarry Smith /* backward solve the L^T */ 818f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 819f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 820f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 821f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 822f1af5d2fSBarry Smith idt = 4*i; 823f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 824f1af5d2fSBarry Smith while (nz--) { 825f1af5d2fSBarry Smith idx = 4*(*vi--); 826f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 827f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 828f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 829f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 830f1af5d2fSBarry Smith v -= 16; 831f1af5d2fSBarry Smith } 832f1af5d2fSBarry Smith } 833f1af5d2fSBarry Smith 834f1af5d2fSBarry Smith /* copy t into x according to permutation */ 835f1af5d2fSBarry Smith ii = 0; 836f1af5d2fSBarry Smith for (i=0; i<n; i++) { 837f1af5d2fSBarry Smith ir = 4*r[i]; 838f1af5d2fSBarry Smith x[ir] = t[ii]; 839f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 840f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 841f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 842f1af5d2fSBarry Smith ii += 4; 843f1af5d2fSBarry Smith } 844f1af5d2fSBarry Smith 845f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 846f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 8471ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 8481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 849dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 850f1af5d2fSBarry Smith PetscFunctionReturn(0); 851f1af5d2fSBarry Smith } 852f1af5d2fSBarry Smith 8534a2ae208SSatish Balay #undef __FUNCT__ 8544a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 855dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 856f1af5d2fSBarry Smith { 857f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 858f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8596849ba73SBarry Smith PetscErrorCode ierr; 8605d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8615d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 862690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 863f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 86487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 86587828ca2SBarry Smith PetscScalar *x,*b,*t; 866f1af5d2fSBarry Smith 867f1af5d2fSBarry Smith PetscFunctionBegin; 8681ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 870f1af5d2fSBarry Smith t = a->solve_work; 871f1af5d2fSBarry Smith 872f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 873f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 874f1af5d2fSBarry Smith 875f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 876f1af5d2fSBarry Smith ii = 0; 877f1af5d2fSBarry Smith for (i=0; i<n; i++) { 878f1af5d2fSBarry Smith ic = 5*c[i]; 879f1af5d2fSBarry Smith t[ii] = b[ic]; 880f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 881f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 882f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 883f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 884f1af5d2fSBarry Smith ii += 5; 885f1af5d2fSBarry Smith } 886f1af5d2fSBarry Smith 887f1af5d2fSBarry Smith /* forward solve the U^T */ 888f1af5d2fSBarry Smith idx = 0; 889f1af5d2fSBarry Smith for (i=0; i<n; i++) { 890f1af5d2fSBarry Smith 891f1af5d2fSBarry Smith v = aa + 25*diag[i]; 892f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 893f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 894f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 895f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 896f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 897f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 898f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 899f1af5d2fSBarry Smith v += 25; 900f1af5d2fSBarry Smith 901f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 902f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 903f1af5d2fSBarry Smith while (nz--) { 904f1af5d2fSBarry Smith oidx = 5*(*vi++); 905f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 906f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 907f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 908f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 909f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 910f1af5d2fSBarry Smith v += 25; 911f1af5d2fSBarry Smith } 912f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 913f1af5d2fSBarry Smith idx += 5; 914f1af5d2fSBarry Smith } 915f1af5d2fSBarry Smith /* backward solve the L^T */ 916f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 917f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 918f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 919f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 920f1af5d2fSBarry Smith idt = 5*i; 921f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 922f1af5d2fSBarry Smith while (nz--) { 923f1af5d2fSBarry Smith idx = 5*(*vi--); 924f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 925f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 926f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 927f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 928f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 929f1af5d2fSBarry Smith v -= 25; 930f1af5d2fSBarry Smith } 931f1af5d2fSBarry Smith } 932f1af5d2fSBarry Smith 933f1af5d2fSBarry Smith /* copy t into x according to permutation */ 934f1af5d2fSBarry Smith ii = 0; 935f1af5d2fSBarry Smith for (i=0; i<n; i++) { 936f1af5d2fSBarry Smith ir = 5*r[i]; 937f1af5d2fSBarry Smith x[ir] = t[ii]; 938f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 939f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 940f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 941f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 942f1af5d2fSBarry Smith ii += 5; 943f1af5d2fSBarry Smith } 944f1af5d2fSBarry Smith 945f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 946f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9471ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 949dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 950f1af5d2fSBarry Smith PetscFunctionReturn(0); 951f1af5d2fSBarry Smith } 952f1af5d2fSBarry Smith 9534a2ae208SSatish Balay #undef __FUNCT__ 9544a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 955dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 956f1af5d2fSBarry Smith { 957f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 958f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9596849ba73SBarry Smith PetscErrorCode ierr; 9605d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9615d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 962690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 963f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 96487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 96587828ca2SBarry Smith PetscScalar *x,*b,*t; 966f1af5d2fSBarry Smith 967f1af5d2fSBarry Smith PetscFunctionBegin; 9681ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 970f1af5d2fSBarry Smith t = a->solve_work; 971f1af5d2fSBarry Smith 972f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 973f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 974f1af5d2fSBarry Smith 975f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 976f1af5d2fSBarry Smith ii = 0; 977f1af5d2fSBarry Smith for (i=0; i<n; i++) { 978f1af5d2fSBarry Smith ic = 6*c[i]; 979f1af5d2fSBarry Smith t[ii] = b[ic]; 980f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 981f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 982f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 983f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 984f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 985f1af5d2fSBarry Smith ii += 6; 986f1af5d2fSBarry Smith } 987f1af5d2fSBarry Smith 988f1af5d2fSBarry Smith /* forward solve the U^T */ 989f1af5d2fSBarry Smith idx = 0; 990f1af5d2fSBarry Smith for (i=0; i<n; i++) { 991f1af5d2fSBarry Smith 992f1af5d2fSBarry Smith v = aa + 36*diag[i]; 993f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 994f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 995f1af5d2fSBarry Smith x6 = t[5+idx]; 996f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 997f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 998f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 999f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1000f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1001f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1002f1af5d2fSBarry Smith v += 36; 1003f1af5d2fSBarry Smith 1004f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1005f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1006f1af5d2fSBarry Smith while (nz--) { 1007f1af5d2fSBarry Smith oidx = 6*(*vi++); 1008f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1009f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1010f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1011f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1012f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1013f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1014f1af5d2fSBarry Smith v += 36; 1015f1af5d2fSBarry Smith } 1016f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1017f1af5d2fSBarry Smith t[5+idx] = s6; 1018f1af5d2fSBarry Smith idx += 6; 1019f1af5d2fSBarry Smith } 1020f1af5d2fSBarry Smith /* backward solve the L^T */ 1021f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1022f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1023f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1024f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1025f1af5d2fSBarry Smith idt = 6*i; 1026f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1027f1af5d2fSBarry Smith s6 = t[5+idt]; 1028f1af5d2fSBarry Smith while (nz--) { 1029f1af5d2fSBarry Smith idx = 6*(*vi--); 1030f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1031f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1032f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1033f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1034f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1035f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1036f1af5d2fSBarry Smith v -= 36; 1037f1af5d2fSBarry Smith } 1038f1af5d2fSBarry Smith } 1039f1af5d2fSBarry Smith 1040f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1041f1af5d2fSBarry Smith ii = 0; 1042f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1043f1af5d2fSBarry Smith ir = 6*r[i]; 1044f1af5d2fSBarry Smith x[ir] = t[ii]; 1045f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1046f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1047f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1048f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1049f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1050f1af5d2fSBarry Smith ii += 6; 1051f1af5d2fSBarry Smith } 1052f1af5d2fSBarry Smith 1053f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1054f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10551ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1057dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1058f1af5d2fSBarry Smith PetscFunctionReturn(0); 1059f1af5d2fSBarry Smith } 1060f1af5d2fSBarry Smith 10614a2ae208SSatish Balay #undef __FUNCT__ 10624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1063dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1064f1af5d2fSBarry Smith { 1065f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1066f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10676849ba73SBarry Smith PetscErrorCode ierr; 10685d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10695d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1070690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1071f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 107287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 107387828ca2SBarry Smith PetscScalar *x,*b,*t; 1074f1af5d2fSBarry Smith 1075f1af5d2fSBarry Smith PetscFunctionBegin; 10761ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10771ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1078f1af5d2fSBarry Smith t = a->solve_work; 1079f1af5d2fSBarry Smith 1080f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1081f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1082f1af5d2fSBarry Smith 1083f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1084f1af5d2fSBarry Smith ii = 0; 1085f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1086f1af5d2fSBarry Smith ic = 7*c[i]; 1087f1af5d2fSBarry Smith t[ii] = b[ic]; 1088f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1089f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1090f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1091f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1092f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1093f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1094f1af5d2fSBarry Smith ii += 7; 1095f1af5d2fSBarry Smith } 1096f1af5d2fSBarry Smith 1097f1af5d2fSBarry Smith /* forward solve the U^T */ 1098f1af5d2fSBarry Smith idx = 0; 1099f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1100f1af5d2fSBarry Smith 1101f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1102f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1103f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1104f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1105f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1106f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1107f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1108f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1109f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1110f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1111f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1112f1af5d2fSBarry Smith v += 49; 1113f1af5d2fSBarry Smith 1114f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1115f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1116f1af5d2fSBarry Smith while (nz--) { 1117f1af5d2fSBarry Smith oidx = 7*(*vi++); 1118f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1119f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1120f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1121f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1122f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1123f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1124f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1125f1af5d2fSBarry Smith v += 49; 1126f1af5d2fSBarry Smith } 1127f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1128f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1129f1af5d2fSBarry Smith idx += 7; 1130f1af5d2fSBarry Smith } 1131f1af5d2fSBarry Smith /* backward solve the L^T */ 1132f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1133f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1134f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1135f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1136f1af5d2fSBarry Smith idt = 7*i; 1137f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1138f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1139f1af5d2fSBarry Smith while (nz--) { 1140f1af5d2fSBarry Smith idx = 7*(*vi--); 1141f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1142f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1143f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1144f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1145f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1146f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1147f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1148f1af5d2fSBarry Smith v -= 49; 1149f1af5d2fSBarry Smith } 1150f1af5d2fSBarry Smith } 1151f1af5d2fSBarry Smith 1152f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1153f1af5d2fSBarry Smith ii = 0; 1154f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1155f1af5d2fSBarry Smith ir = 7*r[i]; 1156f1af5d2fSBarry Smith x[ir] = t[ii]; 1157f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1158f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1159f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1160f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1161f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1162f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1163f1af5d2fSBarry Smith ii += 7; 1164f1af5d2fSBarry Smith } 1165f1af5d2fSBarry Smith 1166f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11681ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11691ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1171f1af5d2fSBarry Smith PetscFunctionReturn(0); 1172f1af5d2fSBarry Smith } 1173f1af5d2fSBarry Smith 11744e2b4712SSatish Balay /* ----------------------------------------------------------- */ 11754a2ae208SSatish Balay #undef __FUNCT__ 11764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1177dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 11784e2b4712SSatish Balay { 11794e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 11804e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 11816849ba73SBarry Smith PetscErrorCode ierr; 11825d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 11835d0c19d7SBarry Smith PetscInt i,n=a->mbs; 11845d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 11853f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 118687828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 11874e2b4712SSatish Balay 11884e2b4712SSatish Balay PetscFunctionBegin; 11891ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11901ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1191f1af5d2fSBarry Smith t = a->solve_work; 11924e2b4712SSatish Balay 11934e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 11944e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 11954e2b4712SSatish Balay 11964e2b4712SSatish Balay /* forward solve the lower triangular */ 119787828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 11984e2b4712SSatish Balay for (i=1; i<n; i++) { 11994e2b4712SSatish Balay v = aa + bs2*ai[i]; 12004e2b4712SSatish Balay vi = aj + ai[i]; 12014e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1202f1af5d2fSBarry Smith s = t + bs*i; 120387828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 12044e2b4712SSatish Balay while (nz--) { 1205f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 12064e2b4712SSatish Balay v += bs2; 12074e2b4712SSatish Balay } 12084e2b4712SSatish Balay } 12094e2b4712SSatish Balay /* backward solve the upper triangular */ 1210d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 12114e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12124e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 12134e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 12144e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 121587828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12164e2b4712SSatish Balay while (nz--) { 1217f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 12184e2b4712SSatish Balay v += bs2; 12194e2b4712SSatish Balay } 1220f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 122187828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 12224e2b4712SSatish Balay } 12234e2b4712SSatish Balay 12244e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 12254e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12261ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12271ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1228dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 12294e2b4712SSatish Balay PetscFunctionReturn(0); 12304e2b4712SSatish Balay } 12314e2b4712SSatish Balay 12324a2ae208SSatish Balay #undef __FUNCT__ 12334a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1234dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 12354e2b4712SSatish Balay { 12364e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 12374e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 12386849ba73SBarry Smith PetscErrorCode ierr; 12395d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 12405d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 12413f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 124287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 124387828ca2SBarry Smith PetscScalar *x,*b,*t; 12444e2b4712SSatish Balay 12454e2b4712SSatish Balay PetscFunctionBegin; 12461ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1248f1af5d2fSBarry Smith t = a->solve_work; 12494e2b4712SSatish Balay 12504e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 12514e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 12524e2b4712SSatish Balay 12534e2b4712SSatish Balay /* forward solve the lower triangular */ 12544e2b4712SSatish Balay idx = 7*(*r++); 1255f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1256f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1257f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 12584e2b4712SSatish Balay 12594e2b4712SSatish Balay for (i=1; i<n; i++) { 12604e2b4712SSatish Balay v = aa + 49*ai[i]; 12614e2b4712SSatish Balay vi = aj + ai[i]; 12624e2b4712SSatish Balay nz = diag[i] - ai[i]; 12634e2b4712SSatish Balay idx = 7*(*r++); 1264f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1265f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 12664e2b4712SSatish Balay while (nz--) { 12674e2b4712SSatish Balay idx = 7*(*vi++); 1268f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1269f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1270f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1271f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1272f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1273f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1274f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1275f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1276f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1277f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 12784e2b4712SSatish Balay v += 49; 12794e2b4712SSatish Balay } 12804e2b4712SSatish Balay idx = 7*i; 1281f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1282f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1283f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 12844e2b4712SSatish Balay } 12854e2b4712SSatish Balay /* backward solve the upper triangular */ 12864e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 12874e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 12884e2b4712SSatish Balay vi = aj + diag[i] + 1; 12894e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 12904e2b4712SSatish Balay idt = 7*i; 1291f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1292f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1293f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 12944e2b4712SSatish Balay while (nz--) { 12954e2b4712SSatish Balay idx = 7*(*vi++); 1296f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1297f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1298f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1299f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1300f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1301f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1302f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1303f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1304f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1305f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 13064e2b4712SSatish Balay v += 49; 13074e2b4712SSatish Balay } 13084e2b4712SSatish Balay idc = 7*(*c--); 13094e2b4712SSatish Balay v = aa + 49*diag[i]; 1310f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1311f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1312f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1313f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1314f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1315f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1316f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1317f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1318f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1319f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1320f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1321f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1322f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1323f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 13244e2b4712SSatish Balay } 13254e2b4712SSatish Balay 13264e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 13274e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13281ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 13291ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1330dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 13314e2b4712SSatish Balay PetscFunctionReturn(0); 13324e2b4712SSatish Balay } 13334e2b4712SSatish Balay 13344a2ae208SSatish Balay #undef __FUNCT__ 13354a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1336dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 133715091d37SBarry Smith { 133815091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1339690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1340dfbe8321SBarry Smith PetscErrorCode ierr; 1341690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1342d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1343d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1344d9fead3dSBarry Smith const PetscScalar *b; 134515091d37SBarry Smith 134615091d37SBarry Smith PetscFunctionBegin; 1347d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 13481ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134915091d37SBarry Smith /* forward solve the lower triangular */ 135015091d37SBarry Smith idx = 0; 135115091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 135215091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 135315091d37SBarry Smith x[6] = b[6+idx]; 135415091d37SBarry Smith for (i=1; i<n; i++) { 135515091d37SBarry Smith v = aa + 49*ai[i]; 135615091d37SBarry Smith vi = aj + ai[i]; 135715091d37SBarry Smith nz = diag[i] - ai[i]; 135815091d37SBarry Smith idx = 7*i; 1359f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1360f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1361f1af5d2fSBarry Smith s7 = b[6+idx]; 136215091d37SBarry Smith while (nz--) { 136315091d37SBarry Smith jdx = 7*(*vi++); 136415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 136515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 136615091d37SBarry Smith x7 = x[6+jdx]; 1367f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1368f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1369f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1370f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1371f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1372f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1373f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 137415091d37SBarry Smith v += 49; 137515091d37SBarry Smith } 1376f1af5d2fSBarry Smith x[idx] = s1; 1377f1af5d2fSBarry Smith x[1+idx] = s2; 1378f1af5d2fSBarry Smith x[2+idx] = s3; 1379f1af5d2fSBarry Smith x[3+idx] = s4; 1380f1af5d2fSBarry Smith x[4+idx] = s5; 1381f1af5d2fSBarry Smith x[5+idx] = s6; 1382f1af5d2fSBarry Smith x[6+idx] = s7; 138315091d37SBarry Smith } 138415091d37SBarry Smith /* backward solve the upper triangular */ 138515091d37SBarry Smith for (i=n-1; i>=0; i--){ 138615091d37SBarry Smith v = aa + 49*diag[i] + 49; 138715091d37SBarry Smith vi = aj + diag[i] + 1; 138815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 138915091d37SBarry Smith idt = 7*i; 1390f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1391f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1392f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1393f1af5d2fSBarry Smith s7 = x[6+idt]; 139415091d37SBarry Smith while (nz--) { 139515091d37SBarry Smith idx = 7*(*vi++); 139615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 139715091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 139815091d37SBarry Smith x7 = x[6+idx]; 1399f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1400f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1401f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1402f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1403f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1404f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1405f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 140615091d37SBarry Smith v += 49; 140715091d37SBarry Smith } 140815091d37SBarry Smith v = aa + 49*diag[i]; 1409f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1410f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1411f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1412f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 1413f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1414f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 1415f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1416f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 1417f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1418f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 1419f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1420f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 1421f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1422f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 142315091d37SBarry Smith } 142415091d37SBarry Smith 1425d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1427dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 142815091d37SBarry Smith PetscFunctionReturn(0); 142915091d37SBarry Smith } 143015091d37SBarry Smith 14314a2ae208SSatish Balay #undef __FUNCT__ 14324a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1433dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 143415091d37SBarry Smith { 143515091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 143615091d37SBarry Smith IS iscol=a->col,isrow=a->row; 14376849ba73SBarry Smith PetscErrorCode ierr; 14385d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 14395d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1440d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1441d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1442d9fead3dSBarry Smith const PetscScalar *b; 144315091d37SBarry Smith PetscFunctionBegin; 1444d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 14451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1446f1af5d2fSBarry Smith t = a->solve_work; 144715091d37SBarry Smith 144815091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 144915091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 145015091d37SBarry Smith 145115091d37SBarry Smith /* forward solve the lower triangular */ 145215091d37SBarry Smith idx = 6*(*r++); 1453f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1454f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 1455f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 145615091d37SBarry Smith for (i=1; i<n; i++) { 145715091d37SBarry Smith v = aa + 36*ai[i]; 145815091d37SBarry Smith vi = aj + ai[i]; 145915091d37SBarry Smith nz = diag[i] - ai[i]; 146015091d37SBarry Smith idx = 6*(*r++); 1461f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1462f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 146315091d37SBarry Smith while (nz--) { 146415091d37SBarry Smith idx = 6*(*vi++); 1465f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1466f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1467f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1468f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1469f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1470f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1471f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1472f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 147315091d37SBarry Smith v += 36; 147415091d37SBarry Smith } 147515091d37SBarry Smith idx = 6*i; 1476f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1477f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 1478f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 147915091d37SBarry Smith } 148015091d37SBarry Smith /* backward solve the upper triangular */ 148115091d37SBarry Smith for (i=n-1; i>=0; i--){ 148215091d37SBarry Smith v = aa + 36*diag[i] + 36; 148315091d37SBarry Smith vi = aj + diag[i] + 1; 148415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 148515091d37SBarry Smith idt = 6*i; 1486f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1487f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 1488f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 148915091d37SBarry Smith while (nz--) { 149015091d37SBarry Smith idx = 6*(*vi++); 1491f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1492f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1493f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 1494f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1495f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1496f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1497f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1498f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1499f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 150015091d37SBarry Smith v += 36; 150115091d37SBarry Smith } 150215091d37SBarry Smith idc = 6*(*c--); 150315091d37SBarry Smith v = aa + 36*diag[i]; 1504f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1505f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 1506f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1507f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 1508f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1509f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 1510f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1511f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 1512f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1513f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 1514f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1515f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 151615091d37SBarry Smith } 151715091d37SBarry Smith 151815091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 151915091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1520d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15211ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1522dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 152315091d37SBarry Smith PetscFunctionReturn(0); 152415091d37SBarry Smith } 152515091d37SBarry Smith 15264a2ae208SSatish Balay #undef __FUNCT__ 15274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1528dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 152915091d37SBarry Smith { 153015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1531690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1532dfbe8321SBarry Smith PetscErrorCode ierr; 1533690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1534d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1535d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1536d9fead3dSBarry Smith const PetscScalar *b; 153715091d37SBarry Smith 153815091d37SBarry Smith PetscFunctionBegin; 1539d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 154115091d37SBarry Smith /* forward solve the lower triangular */ 154215091d37SBarry Smith idx = 0; 154315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 154415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 154515091d37SBarry Smith for (i=1; i<n; i++) { 154615091d37SBarry Smith v = aa + 36*ai[i]; 154715091d37SBarry Smith vi = aj + ai[i]; 154815091d37SBarry Smith nz = diag[i] - ai[i]; 154915091d37SBarry Smith idx = 6*i; 1550f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1551f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 155215091d37SBarry Smith while (nz--) { 155315091d37SBarry Smith jdx = 6*(*vi++); 155415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 155515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1556f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1557f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1558f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1559f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1560f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1561f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 156215091d37SBarry Smith v += 36; 156315091d37SBarry Smith } 1564f1af5d2fSBarry Smith x[idx] = s1; 1565f1af5d2fSBarry Smith x[1+idx] = s2; 1566f1af5d2fSBarry Smith x[2+idx] = s3; 1567f1af5d2fSBarry Smith x[3+idx] = s4; 1568f1af5d2fSBarry Smith x[4+idx] = s5; 1569f1af5d2fSBarry Smith x[5+idx] = s6; 157015091d37SBarry Smith } 157115091d37SBarry Smith /* backward solve the upper triangular */ 157215091d37SBarry Smith for (i=n-1; i>=0; i--){ 157315091d37SBarry Smith v = aa + 36*diag[i] + 36; 157415091d37SBarry Smith vi = aj + diag[i] + 1; 157515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 157615091d37SBarry Smith idt = 6*i; 1577f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1578f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1579f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 158015091d37SBarry Smith while (nz--) { 158115091d37SBarry Smith idx = 6*(*vi++); 158215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 158315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1584f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1585f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1586f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1587f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1588f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1589f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 159015091d37SBarry Smith v += 36; 159115091d37SBarry Smith } 159215091d37SBarry Smith v = aa + 36*diag[i]; 1593f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1594f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1595f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1596f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1597f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1598f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 159915091d37SBarry Smith } 160015091d37SBarry Smith 1601d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1603dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 160415091d37SBarry Smith PetscFunctionReturn(0); 160515091d37SBarry Smith } 160615091d37SBarry Smith 16074a2ae208SSatish Balay #undef __FUNCT__ 16084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1609dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 16104e2b4712SSatish Balay { 16114e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16124e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 16136849ba73SBarry Smith PetscErrorCode ierr; 16145d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 16155d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1616d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1617d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1618d9fead3dSBarry Smith const PetscScalar *b; 16194e2b4712SSatish Balay 16204e2b4712SSatish Balay PetscFunctionBegin; 1621d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1623f1af5d2fSBarry Smith t = a->solve_work; 16244e2b4712SSatish Balay 16254e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 16264e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 16274e2b4712SSatish Balay 16284e2b4712SSatish Balay /* forward solve the lower triangular */ 16294e2b4712SSatish Balay idx = 5*(*r++); 1630f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1631f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 16324e2b4712SSatish Balay for (i=1; i<n; i++) { 16334e2b4712SSatish Balay v = aa + 25*ai[i]; 16344e2b4712SSatish Balay vi = aj + ai[i]; 16354e2b4712SSatish Balay nz = diag[i] - ai[i]; 16364e2b4712SSatish Balay idx = 5*(*r++); 1637f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1638f1af5d2fSBarry Smith s5 = b[4+idx]; 16394e2b4712SSatish Balay while (nz--) { 16404e2b4712SSatish Balay idx = 5*(*vi++); 1641f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1642f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1643f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1644f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1645f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1646f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1647f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16484e2b4712SSatish Balay v += 25; 16494e2b4712SSatish Balay } 16504e2b4712SSatish Balay idx = 5*i; 1651f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1652f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 16534e2b4712SSatish Balay } 16544e2b4712SSatish Balay /* backward solve the upper triangular */ 16554e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 16564e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 16574e2b4712SSatish Balay vi = aj + diag[i] + 1; 16584e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 16594e2b4712SSatish Balay idt = 5*i; 1660f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1661f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 16624e2b4712SSatish Balay while (nz--) { 16634e2b4712SSatish Balay idx = 5*(*vi++); 1664f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1665f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1666f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1667f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1668f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1669f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1670f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 16714e2b4712SSatish Balay v += 25; 16724e2b4712SSatish Balay } 16734e2b4712SSatish Balay idc = 5*(*c--); 16744e2b4712SSatish Balay v = aa + 25*diag[i]; 1675f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1676f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 1677f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1678f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 1679f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1680f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 1681f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1682f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 1683f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1684f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 16854e2b4712SSatish Balay } 16864e2b4712SSatish Balay 16874e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16884e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1689d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16901ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1691dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 16924e2b4712SSatish Balay PetscFunctionReturn(0); 16934e2b4712SSatish Balay } 16944e2b4712SSatish Balay 169584a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 169684a281e5SHong Zhang { 169784a281e5SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 169884a281e5SHong Zhang PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 169984a281e5SHong Zhang PetscErrorCode ierr; 170084a281e5SHong Zhang PetscInt jdx; 170184a281e5SHong Zhang const MatScalar *aa=a->a,*v; 170284a281e5SHong Zhang PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 170384a281e5SHong Zhang const PetscScalar *b; 170484a281e5SHong Zhang 170584a281e5SHong Zhang PetscFunctionBegin; 170684a281e5SHong Zhang ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 170784a281e5SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 170884a281e5SHong Zhang /* forward solve the lower triangular */ 170984a281e5SHong Zhang idx = 0; 171084a281e5SHong Zhang x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 171184a281e5SHong Zhang for (i=1; i<n; i++) { 171284a281e5SHong Zhang v = aa + 25*ai[i]; 171384a281e5SHong Zhang vi = aj + ai[i]; 171484a281e5SHong Zhang nz = ai[i+1] - ai[i]; 171584a281e5SHong Zhang idx = 5*i; 171684a281e5SHong Zhang s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 171784a281e5SHong Zhang while (nz--) { 171884a281e5SHong Zhang jdx = 5*(*vi++); 171984a281e5SHong Zhang x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 172084a281e5SHong Zhang s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 172184a281e5SHong Zhang s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 172284a281e5SHong Zhang s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 172384a281e5SHong Zhang s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 172484a281e5SHong Zhang s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 172584a281e5SHong Zhang v += 25; 172684a281e5SHong Zhang } 172784a281e5SHong Zhang x[idx] = s1; 172884a281e5SHong Zhang x[1+idx] = s2; 172984a281e5SHong Zhang x[2+idx] = s3; 173084a281e5SHong Zhang x[3+idx] = s4; 173184a281e5SHong Zhang x[4+idx] = s5; 173284a281e5SHong Zhang } 173384a281e5SHong Zhang 173484a281e5SHong Zhang /* backward solve the upper triangular */ 173584a281e5SHong Zhang for (i=n-1; i>=0; i--){ 173684a281e5SHong Zhang v = aa + 25*ai[2*n-i]; 173784a281e5SHong Zhang vi = aj + ai[2*n-i]; 173884a281e5SHong Zhang nz = ai[2*n-i +1] - ai[2*n-i]-1; 173984a281e5SHong Zhang idt = 5*i; 174084a281e5SHong Zhang s1 = x[idt]; s2 = x[1+idt]; 174184a281e5SHong Zhang s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 174284a281e5SHong Zhang while (nz--) { 174384a281e5SHong Zhang idx = 5*(*vi++); 174484a281e5SHong Zhang x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 174584a281e5SHong Zhang s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 174684a281e5SHong Zhang s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 174784a281e5SHong Zhang s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 174884a281e5SHong Zhang s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 174984a281e5SHong Zhang s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 175084a281e5SHong Zhang v += 25; 175184a281e5SHong Zhang } 175284a281e5SHong Zhang /* x = inv_diagonal*x */ 175384a281e5SHong Zhang x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 175484a281e5SHong Zhang x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 175584a281e5SHong Zhang x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 175684a281e5SHong Zhang x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 175784a281e5SHong Zhang x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 175884a281e5SHong Zhang } 175984a281e5SHong Zhang 176084a281e5SHong Zhang ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 176184a281e5SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 176284a281e5SHong Zhang ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 176384a281e5SHong Zhang PetscFunctionReturn(0); 176484a281e5SHong Zhang } 176584a281e5SHong Zhang 17664a2ae208SSatish Balay #undef __FUNCT__ 17674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1768dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 176915091d37SBarry Smith { 177015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1771690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1772dfbe8321SBarry Smith PetscErrorCode ierr; 1773690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1774d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1775d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1776d9fead3dSBarry Smith const PetscScalar *b; 177715091d37SBarry Smith 177815091d37SBarry Smith PetscFunctionBegin; 1779d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 17801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 178115091d37SBarry Smith /* forward solve the lower triangular */ 178215091d37SBarry Smith idx = 0; 178315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 178415091d37SBarry Smith for (i=1; i<n; i++) { 178515091d37SBarry Smith v = aa + 25*ai[i]; 178615091d37SBarry Smith vi = aj + ai[i]; 178715091d37SBarry Smith nz = diag[i] - ai[i]; 178815091d37SBarry Smith idx = 5*i; 1789f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 179015091d37SBarry Smith while (nz--) { 179115091d37SBarry Smith jdx = 5*(*vi++); 179215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1793f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1794f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1795f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1796f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1797f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 179815091d37SBarry Smith v += 25; 179915091d37SBarry Smith } 1800f1af5d2fSBarry Smith x[idx] = s1; 1801f1af5d2fSBarry Smith x[1+idx] = s2; 1802f1af5d2fSBarry Smith x[2+idx] = s3; 1803f1af5d2fSBarry Smith x[3+idx] = s4; 1804f1af5d2fSBarry Smith x[4+idx] = s5; 180515091d37SBarry Smith } 180615091d37SBarry Smith /* backward solve the upper triangular */ 180715091d37SBarry Smith for (i=n-1; i>=0; i--){ 180815091d37SBarry Smith v = aa + 25*diag[i] + 25; 180915091d37SBarry Smith vi = aj + diag[i] + 1; 181015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 181115091d37SBarry Smith idt = 5*i; 1812f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1813f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 181415091d37SBarry Smith while (nz--) { 181515091d37SBarry Smith idx = 5*(*vi++); 181615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1817f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1818f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1819f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1820f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1821f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 182215091d37SBarry Smith v += 25; 182315091d37SBarry Smith } 182415091d37SBarry Smith v = aa + 25*diag[i]; 1825f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1826f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1827f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1828f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1829f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 183015091d37SBarry Smith } 183115091d37SBarry Smith 1832d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1834dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 183515091d37SBarry Smith PetscFunctionReturn(0); 183615091d37SBarry Smith } 183715091d37SBarry Smith 18384a2ae208SSatish Balay #undef __FUNCT__ 18394a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1840dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 18414e2b4712SSatish Balay { 18424e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18434e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 18446849ba73SBarry Smith PetscErrorCode ierr; 18455d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 18465d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1847d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1848d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1849d9fead3dSBarry Smith const PetscScalar *b; 18504e2b4712SSatish Balay 18514e2b4712SSatish Balay PetscFunctionBegin; 1852d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 18531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1854f1af5d2fSBarry Smith t = a->solve_work; 18554e2b4712SSatish Balay 18564e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 18574e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 18584e2b4712SSatish Balay 18594e2b4712SSatish Balay /* forward solve the lower triangular */ 18604e2b4712SSatish Balay idx = 4*(*r++); 1861f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1862f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 18634e2b4712SSatish Balay for (i=1; i<n; i++) { 18644e2b4712SSatish Balay v = aa + 16*ai[i]; 18654e2b4712SSatish Balay vi = aj + ai[i]; 18664e2b4712SSatish Balay nz = diag[i] - ai[i]; 18674e2b4712SSatish Balay idx = 4*(*r++); 1868f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 18694e2b4712SSatish Balay while (nz--) { 18704e2b4712SSatish Balay idx = 4*(*vi++); 1871f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1872f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1873f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1874f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1875f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 18764e2b4712SSatish Balay v += 16; 18774e2b4712SSatish Balay } 18784e2b4712SSatish Balay idx = 4*i; 1879f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1880f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 18814e2b4712SSatish Balay } 18824e2b4712SSatish Balay /* backward solve the upper triangular */ 18834e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 18844e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 18854e2b4712SSatish Balay vi = aj + diag[i] + 1; 18864e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 18874e2b4712SSatish Balay idt = 4*i; 1888f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1889f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 18904e2b4712SSatish Balay while (nz--) { 18914e2b4712SSatish Balay idx = 4*(*vi++); 1892f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1893f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 1894f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1895f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1896f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1897f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 18984e2b4712SSatish Balay v += 16; 18994e2b4712SSatish Balay } 19004e2b4712SSatish Balay idc = 4*(*c--); 19014e2b4712SSatish Balay v = aa + 16*diag[i]; 1902f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1903f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1904f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1905f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 19064e2b4712SSatish Balay } 19074e2b4712SSatish Balay 19084e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 19094e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1910d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1912dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 19134e2b4712SSatish Balay PetscFunctionReturn(0); 19144e2b4712SSatish Balay } 1915f26ec98cSKris Buschelman 1916f26ec98cSKris Buschelman #undef __FUNCT__ 1917f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1918dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1919f26ec98cSKris Buschelman { 1920f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1921f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 19226849ba73SBarry Smith PetscErrorCode ierr; 19235d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 19245d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1925d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1926d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1927d9fead3dSBarry Smith PetscScalar *x; 1928d9fead3dSBarry Smith const PetscScalar *b; 1929f26ec98cSKris Buschelman 1930f26ec98cSKris Buschelman PetscFunctionBegin; 1931d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1933f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 1934f26ec98cSKris Buschelman 1935f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1936f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1937f26ec98cSKris Buschelman 1938f26ec98cSKris Buschelman /* forward solve the lower triangular */ 1939f26ec98cSKris Buschelman idx = 4*(*r++); 1940f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 1941f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 1942f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 1943f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 1944f26ec98cSKris Buschelman for (i=1; i<n; i++) { 1945f26ec98cSKris Buschelman v = aa + 16*ai[i]; 1946f26ec98cSKris Buschelman vi = aj + ai[i]; 1947f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 1948f26ec98cSKris Buschelman idx = 4*(*r++); 1949f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 1950f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 1951f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 1952f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 1953f26ec98cSKris Buschelman while (nz--) { 1954f26ec98cSKris Buschelman idx = 4*(*vi++); 1955f26ec98cSKris Buschelman x1 = t[idx]; 1956f26ec98cSKris Buschelman x2 = t[1+idx]; 1957f26ec98cSKris Buschelman x3 = t[2+idx]; 1958f26ec98cSKris Buschelman x4 = t[3+idx]; 1959f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1960f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1961f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1962f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1963f26ec98cSKris Buschelman v += 16; 1964f26ec98cSKris Buschelman } 1965f26ec98cSKris Buschelman idx = 4*i; 1966f26ec98cSKris Buschelman t[idx] = s1; 1967f26ec98cSKris Buschelman t[1+idx] = s2; 1968f26ec98cSKris Buschelman t[2+idx] = s3; 1969f26ec98cSKris Buschelman t[3+idx] = s4; 1970f26ec98cSKris Buschelman } 1971f26ec98cSKris Buschelman /* backward solve the upper triangular */ 1972f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 1973f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 1974f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 1975f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 1976f26ec98cSKris Buschelman idt = 4*i; 1977f26ec98cSKris Buschelman s1 = t[idt]; 1978f26ec98cSKris Buschelman s2 = t[1+idt]; 1979f26ec98cSKris Buschelman s3 = t[2+idt]; 1980f26ec98cSKris Buschelman s4 = t[3+idt]; 1981f26ec98cSKris Buschelman while (nz--) { 1982f26ec98cSKris Buschelman idx = 4*(*vi++); 1983f26ec98cSKris Buschelman x1 = t[idx]; 1984f26ec98cSKris Buschelman x2 = t[1+idx]; 1985f26ec98cSKris Buschelman x3 = t[2+idx]; 1986f26ec98cSKris Buschelman x4 = t[3+idx]; 1987f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1988f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1989f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1990f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1991f26ec98cSKris Buschelman v += 16; 1992f26ec98cSKris Buschelman } 1993f26ec98cSKris Buschelman idc = 4*(*c--); 1994f26ec98cSKris Buschelman v = aa + 16*diag[i]; 1995f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1996f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1997f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1998f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1999f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 2000f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 2001f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 2002f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 2003f26ec98cSKris Buschelman } 2004f26ec98cSKris Buschelman 2005f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2006f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2007d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2009dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2010f26ec98cSKris Buschelman PetscFunctionReturn(0); 2011f26ec98cSKris Buschelman } 2012f26ec98cSKris Buschelman 201324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 201424c233c2SKris Buschelman 201524c233c2SKris Buschelman #include PETSC_HAVE_SSE 201624c233c2SKris Buschelman 201724c233c2SKris Buschelman #undef __FUNCT__ 201824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2019dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 202024c233c2SKris Buschelman { 202124c233c2SKris Buschelman /* 202224c233c2SKris Buschelman Note: This code uses demotion of double 202324c233c2SKris Buschelman to float when performing the mixed-mode computation. 202424c233c2SKris Buschelman This may not be numerically reasonable for all applications. 202524c233c2SKris Buschelman */ 202624c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 202724c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 20286849ba73SBarry Smith PetscErrorCode ierr; 20295d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 20305d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 203124c233c2SKris Buschelman MatScalar *aa=a->a,*v; 203287828ca2SBarry Smith PetscScalar *x,*b,*t; 203324c233c2SKris Buschelman 203424c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 203524c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 203624c233c2SKris Buschelman unsigned long offset; 203724c233c2SKris Buschelman 203824c233c2SKris Buschelman PetscFunctionBegin; 203924c233c2SKris Buschelman SSE_SCOPE_BEGIN; 204024c233c2SKris Buschelman 204124c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 204224c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 204324c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 204424c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 204524c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 204624c233c2SKris Buschelman 20471ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 20481ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 204924c233c2SKris Buschelman t = a->solve_work; 205024c233c2SKris Buschelman 205124c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 205224c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 205324c233c2SKris Buschelman 205424c233c2SKris Buschelman /* forward solve the lower triangular */ 205524c233c2SKris Buschelman idx = 4*(*r++); 205624c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 205724c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 205824c233c2SKris Buschelman v = aa + 16*ai[1]; 205924c233c2SKris Buschelman 206024c233c2SKris Buschelman for (i=1; i<n;) { 206124c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 206224c233c2SKris Buschelman vi = aj + ai[i]; 206324c233c2SKris Buschelman nz = diag[i] - ai[i]; 206424c233c2SKris Buschelman idx = 4*(*r++); 206524c233c2SKris Buschelman 206624c233c2SKris Buschelman /* Demote sum from double to float */ 206724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 206824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 206924c233c2SKris Buschelman 207024c233c2SKris Buschelman while (nz--) { 207124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 207224c233c2SKris Buschelman idx = 4*(*vi++); 207324c233c2SKris Buschelman 207424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 207524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 207624c233c2SKris Buschelman 207724c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 207824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 207924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 208024c233c2SKris Buschelman 208124c233c2SKris Buschelman /* First Column */ 208224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 208324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 208424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 208524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 208624c233c2SKris Buschelman 208724c233c2SKris Buschelman /* Second Column */ 208824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 208924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 209024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 209124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 209224c233c2SKris Buschelman 209324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 209424c233c2SKris Buschelman 209524c233c2SKris Buschelman /* Third Column */ 209624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 209724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 209824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 209924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 210024c233c2SKris Buschelman 210124c233c2SKris Buschelman /* Fourth Column */ 210224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 210324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 210424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 210524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 210624c233c2SKris Buschelman SSE_INLINE_END_2 210724c233c2SKris Buschelman 210824c233c2SKris Buschelman v += 16; 210924c233c2SKris Buschelman } 211024c233c2SKris Buschelman idx = 4*i; 211124c233c2SKris Buschelman v = aa + 16*ai[++i]; 211224c233c2SKris Buschelman PREFETCH_NTA(v); 211324c233c2SKris Buschelman STORE_PS(tmps,XMM7); 211424c233c2SKris Buschelman 211524c233c2SKris Buschelman /* Promote result from float to double */ 211624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 211724c233c2SKris Buschelman } 211824c233c2SKris Buschelman /* backward solve the upper triangular */ 211924c233c2SKris Buschelman idt = 4*(n-1); 212024c233c2SKris Buschelman ai16 = 16*diag[n-1]; 212124c233c2SKris Buschelman v = aa + ai16 + 16; 212224c233c2SKris Buschelman for (i=n-1; i>=0;){ 212324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 212424c233c2SKris Buschelman vi = aj + diag[i] + 1; 212524c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 212624c233c2SKris Buschelman 212724c233c2SKris Buschelman /* Demote accumulator from double to float */ 212824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 212924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 213024c233c2SKris Buschelman 213124c233c2SKris Buschelman while (nz--) { 213224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 213324c233c2SKris Buschelman idx = 4*(*vi++); 213424c233c2SKris Buschelman 213524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 213624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 213724c233c2SKris Buschelman 213824c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 213924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 214024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 214124c233c2SKris Buschelman 214224c233c2SKris Buschelman /* First Column */ 214324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 214424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 214524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 214624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 214724c233c2SKris Buschelman 214824c233c2SKris Buschelman /* Second Column */ 214924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 215024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 215124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 215224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 215324c233c2SKris Buschelman 215424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 215524c233c2SKris Buschelman 215624c233c2SKris Buschelman /* Third Column */ 215724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 215824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 215924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 216024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 216124c233c2SKris Buschelman 216224c233c2SKris Buschelman /* Fourth Column */ 216324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 216424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 216524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 216624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 216724c233c2SKris Buschelman SSE_INLINE_END_2 216824c233c2SKris Buschelman v += 16; 216924c233c2SKris Buschelman } 217024c233c2SKris Buschelman v = aa + ai16; 217124c233c2SKris Buschelman ai16 = 16*diag[--i]; 217224c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 217324c233c2SKris Buschelman /* 217424c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 217524c233c2SKris Buschelman which was inverted as part of the factorization 217624c233c2SKris Buschelman */ 217724c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 217824c233c2SKris Buschelman /* First Column */ 217924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 218024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 218124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 218224c233c2SKris Buschelman 218324c233c2SKris Buschelman /* Second Column */ 218424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 218524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 218624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 218724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 218824c233c2SKris Buschelman 218924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 219024c233c2SKris Buschelman 219124c233c2SKris Buschelman /* Third Column */ 219224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 219324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 219424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 219524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 219624c233c2SKris Buschelman 219724c233c2SKris Buschelman /* Fourth Column */ 219824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 219924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 220024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 220124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 220224c233c2SKris Buschelman 220324c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 220424c233c2SKris Buschelman SSE_INLINE_END_3 220524c233c2SKris Buschelman 220624c233c2SKris Buschelman /* Promote solution from float to double */ 220724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 220824c233c2SKris Buschelman 220924c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 221024c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 221124c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 221224c233c2SKris Buschelman idc = 4*(*c--); 221324c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 221424c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 221524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 221624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 221724c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 221824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 221924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 222024c233c2SKris Buschelman SSE_INLINE_END_2 222124c233c2SKris Buschelman v = aa + ai16 + 16; 222224c233c2SKris Buschelman idt -= 4; 222324c233c2SKris Buschelman } 222424c233c2SKris Buschelman 222524c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 222624c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22271ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 22281ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2229dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 223024c233c2SKris Buschelman SSE_SCOPE_END; 223124c233c2SKris Buschelman PetscFunctionReturn(0); 223224c233c2SKris Buschelman } 223324c233c2SKris Buschelman 223424c233c2SKris Buschelman #endif 22350ef38995SBarry Smith 22360ef38995SBarry Smith 22374e2b4712SSatish Balay /* 22384e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 22394e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 22404e2b4712SSatish Balay */ 22414a2ae208SSatish Balay #undef __FUNCT__ 22424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2243dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 22444e2b4712SSatish Balay { 22454e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2246356650c2SBarry Smith PetscInt n=a->mbs; 2247356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 2248dfbe8321SBarry Smith PetscErrorCode ierr; 2249356650c2SBarry Smith const PetscInt *diag = a->diag; 2250d9fead3dSBarry Smith const MatScalar *aa=a->a; 2251d9fead3dSBarry Smith PetscScalar *x; 2252d9fead3dSBarry Smith const PetscScalar *b; 22534e2b4712SSatish Balay 22544e2b4712SSatish Balay PetscFunctionBegin; 2255d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22574e2b4712SSatish Balay 2258aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 22592853dc0eSBarry Smith { 226087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 22612853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 22622853dc0eSBarry Smith } 2263aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 22642853dc0eSBarry Smith { 226587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 22662853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 22672853dc0eSBarry Smith } 2268aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 22692853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2270e1293385SBarry Smith #else 227130d4dcafSBarry Smith { 227287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2273d9fead3dSBarry Smith const MatScalar *v; 2274356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 2275356650c2SBarry Smith const PetscInt *vi; 2276e1293385SBarry Smith 22774e2b4712SSatish Balay /* forward solve the lower triangular */ 22784e2b4712SSatish Balay idx = 0; 2279e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 22804e2b4712SSatish Balay for (i=1; i<n; i++) { 22814e2b4712SSatish Balay v = aa + 16*ai[i]; 22824e2b4712SSatish Balay vi = aj + ai[i]; 22834e2b4712SSatish Balay nz = diag[i] - ai[i]; 2284e1293385SBarry Smith idx += 4; 2285f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 22864e2b4712SSatish Balay while (nz--) { 22874e2b4712SSatish Balay jdx = 4*(*vi++); 22884e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2289f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2290f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2291f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2292f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 22934e2b4712SSatish Balay v += 16; 22944e2b4712SSatish Balay } 2295f1af5d2fSBarry Smith x[idx] = s1; 2296f1af5d2fSBarry Smith x[1+idx] = s2; 2297f1af5d2fSBarry Smith x[2+idx] = s3; 2298f1af5d2fSBarry Smith x[3+idx] = s4; 22994e2b4712SSatish Balay } 23004e2b4712SSatish Balay /* backward solve the upper triangular */ 23014e555682SBarry Smith idt = 4*(n-1); 23024e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 23034e555682SBarry Smith ai16 = 16*diag[i]; 23044e555682SBarry Smith v = aa + ai16 + 16; 23054e2b4712SSatish Balay vi = aj + diag[i] + 1; 23064e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 2307f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2308f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 23094e2b4712SSatish Balay while (nz--) { 23104e2b4712SSatish Balay idx = 4*(*vi++); 23114e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2312f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2313f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2314f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2315f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 23164e2b4712SSatish Balay v += 16; 23174e2b4712SSatish Balay } 23184e555682SBarry Smith v = aa + ai16; 2319f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2320f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2321f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2322f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2323329f5518SBarry Smith idt -= 4; 23244e2b4712SSatish Balay } 232530d4dcafSBarry Smith } 2326e1293385SBarry Smith #endif 23274e2b4712SSatish Balay 2328d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23291ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2330dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 23314e2b4712SSatish Balay PetscFunctionReturn(0); 23324e2b4712SSatish Balay } 23334e2b4712SSatish Balay 2334f26ec98cSKris Buschelman #undef __FUNCT__ 2335f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2336dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2337f26ec98cSKris Buschelman { 2338f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2339690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2340dfbe8321SBarry Smith PetscErrorCode ierr; 2341690b6cddSBarry Smith PetscInt *diag = a->diag; 2342f26ec98cSKris Buschelman MatScalar *aa=a->a; 2343f26ec98cSKris Buschelman PetscScalar *x,*b; 2344f26ec98cSKris Buschelman 2345f26ec98cSKris Buschelman PetscFunctionBegin; 23461ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2348f26ec98cSKris Buschelman 2349f26ec98cSKris Buschelman { 2350f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2351f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 2352690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2353f26ec98cSKris Buschelman 2354f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2355f26ec98cSKris Buschelman idx = 0; 2356f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 2357f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 2358f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 2359f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 2360f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2361f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2362f26ec98cSKris Buschelman vi = aj + ai[i]; 2363f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2364f26ec98cSKris Buschelman idx += 4; 2365f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2366f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2367f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2368f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2369f26ec98cSKris Buschelman while (nz--) { 2370f26ec98cSKris Buschelman jdx = 4*(*vi++); 2371f26ec98cSKris Buschelman x1 = t[jdx]; 2372f26ec98cSKris Buschelman x2 = t[1+jdx]; 2373f26ec98cSKris Buschelman x3 = t[2+jdx]; 2374f26ec98cSKris Buschelman x4 = t[3+jdx]; 2375f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2376f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2377f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2378f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2379f26ec98cSKris Buschelman v += 16; 2380f26ec98cSKris Buschelman } 2381f26ec98cSKris Buschelman t[idx] = s1; 2382f26ec98cSKris Buschelman t[1+idx] = s2; 2383f26ec98cSKris Buschelman t[2+idx] = s3; 2384f26ec98cSKris Buschelman t[3+idx] = s4; 2385f26ec98cSKris Buschelman } 2386f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2387f26ec98cSKris Buschelman idt = 4*(n-1); 2388f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2389f26ec98cSKris Buschelman ai16 = 16*diag[i]; 2390f26ec98cSKris Buschelman v = aa + ai16 + 16; 2391f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2392f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2393f26ec98cSKris Buschelman s1 = t[idt]; 2394f26ec98cSKris Buschelman s2 = t[1+idt]; 2395f26ec98cSKris Buschelman s3 = t[2+idt]; 2396f26ec98cSKris Buschelman s4 = t[3+idt]; 2397f26ec98cSKris Buschelman while (nz--) { 2398f26ec98cSKris Buschelman idx = 4*(*vi++); 2399f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 2400f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 2401f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 2402f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 2403f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2404f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2405f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2406f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2407f26ec98cSKris Buschelman v += 16; 2408f26ec98cSKris Buschelman } 2409f26ec98cSKris Buschelman v = aa + ai16; 2410f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2411f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2412f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2413f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2414f26ec98cSKris Buschelman idt -= 4; 2415f26ec98cSKris Buschelman } 2416f26ec98cSKris Buschelman } 2417f26ec98cSKris Buschelman 24181ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 24191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2420dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2421f26ec98cSKris Buschelman PetscFunctionReturn(0); 2422f26ec98cSKris Buschelman } 2423f26ec98cSKris Buschelman 24243660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 24253660e330SKris Buschelman 24263660e330SKris Buschelman #include PETSC_HAVE_SSE 24273660e330SKris Buschelman #undef __FUNCT__ 24287cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2429dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 24303660e330SKris Buschelman { 24313660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 24322aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 2433dfbe8321SBarry Smith PetscErrorCode ierr; 2434dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 24353660e330SKris Buschelman MatScalar *aa=a->a; 243687828ca2SBarry Smith PetscScalar *x,*b; 24373660e330SKris Buschelman 24383660e330SKris Buschelman PetscFunctionBegin; 24393660e330SKris Buschelman SSE_SCOPE_BEGIN; 24403660e330SKris Buschelman /* 24413660e330SKris Buschelman Note: This code currently uses demotion of double 24423660e330SKris Buschelman to float when performing the mixed-mode computation. 24433660e330SKris Buschelman This may not be numerically reasonable for all applications. 24443660e330SKris Buschelman */ 24453660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 24463660e330SKris Buschelman 24471ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24481ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24493660e330SKris Buschelman { 2450eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 2451eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 24522aa5897fSKris Buschelman int nz,i,idt,ai16; 24532aa5897fSKris Buschelman unsigned int jdx,idx; 24542aa5897fSKris Buschelman unsigned short *vi; 2455eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 24563660e330SKris Buschelman 2457eb05f457SKris Buschelman /* First block is the identity. */ 24583660e330SKris Buschelman idx = 0; 2459eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 24602aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 24613660e330SKris Buschelman 24623660e330SKris Buschelman for (i=1; i<n;) { 24633660e330SKris Buschelman PREFETCH_NTA(&v[8]); 24643660e330SKris Buschelman vi = aj + ai[i]; 24653660e330SKris Buschelman nz = diag[i] - ai[i]; 24663660e330SKris Buschelman idx += 4; 24673660e330SKris Buschelman 2468eb05f457SKris Buschelman /* Demote RHS from double to float. */ 2469eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2470eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 24713660e330SKris Buschelman 24723660e330SKris Buschelman while (nz--) { 24733660e330SKris Buschelman PREFETCH_NTA(&v[16]); 24742aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 24753660e330SKris Buschelman 24763660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 2477eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 24783660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 24793660e330SKris Buschelman 24803660e330SKris Buschelman /* First Column */ 24813660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 24823660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 24833660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 24843660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 24853660e330SKris Buschelman 24863660e330SKris Buschelman /* Second Column */ 24873660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 24883660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 24893660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 24903660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 24913660e330SKris Buschelman 24923660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 24933660e330SKris Buschelman 24943660e330SKris Buschelman /* Third Column */ 24953660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 24963660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 24973660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 24983660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 24993660e330SKris Buschelman 25003660e330SKris Buschelman /* Fourth Column */ 25013660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25023660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25033660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25043660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25053660e330SKris Buschelman SSE_INLINE_END_2 25063660e330SKris Buschelman 25073660e330SKris Buschelman v += 16; 25083660e330SKris Buschelman } 25093660e330SKris Buschelman v = aa + 16*ai[++i]; 25103660e330SKris Buschelman PREFETCH_NTA(v); 2511eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 25123660e330SKris Buschelman } 2513eb05f457SKris Buschelman 2514eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 2515eb05f457SKris Buschelman 25163660e330SKris Buschelman idt = 4*(n-1); 25173660e330SKris Buschelman ai16 = 16*diag[n-1]; 25183660e330SKris Buschelman v = aa + ai16 + 16; 25193660e330SKris Buschelman for (i=n-1; i>=0;){ 25203660e330SKris Buschelman PREFETCH_NTA(&v[8]); 25213660e330SKris Buschelman vi = aj + diag[i] + 1; 25223660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 25233660e330SKris Buschelman 2524eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 25253660e330SKris Buschelman 25263660e330SKris Buschelman while (nz--) { 25273660e330SKris Buschelman PREFETCH_NTA(&v[16]); 25282aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 25293660e330SKris Buschelman 25303660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 2531eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 25323660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 25333660e330SKris Buschelman 25343660e330SKris Buschelman /* First Column */ 25353660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 25363660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25373660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 25383660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 25393660e330SKris Buschelman 25403660e330SKris Buschelman /* Second Column */ 25413660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 25423660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25433660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 25443660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 25453660e330SKris Buschelman 25463660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 25473660e330SKris Buschelman 25483660e330SKris Buschelman /* Third Column */ 25493660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 25503660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25513660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 25523660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 25533660e330SKris Buschelman 25543660e330SKris Buschelman /* Fourth Column */ 25553660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 25563660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25573660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 25583660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 25593660e330SKris Buschelman SSE_INLINE_END_2 25603660e330SKris Buschelman v += 16; 25613660e330SKris Buschelman } 25623660e330SKris Buschelman v = aa + ai16; 25633660e330SKris Buschelman ai16 = 16*diag[--i]; 25643660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 25653660e330SKris Buschelman /* 25663660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 25673660e330SKris Buschelman which was inverted as part of the factorization 25683660e330SKris Buschelman */ 2569eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 25703660e330SKris Buschelman /* First Column */ 25713660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 25723660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 25733660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 25743660e330SKris Buschelman 25753660e330SKris Buschelman /* Second Column */ 25763660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 25773660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 25783660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 25793660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 25803660e330SKris Buschelman 25813660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 25823660e330SKris Buschelman 25833660e330SKris Buschelman /* Third Column */ 25843660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 25853660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 25863660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 25873660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 25883660e330SKris Buschelman 25893660e330SKris Buschelman /* Fourth Column */ 25903660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 25913660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 25923660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 25933660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 25943660e330SKris Buschelman 25953660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 25963660e330SKris Buschelman SSE_INLINE_END_3 25973660e330SKris Buschelman 25983660e330SKris Buschelman v = aa + ai16 + 16; 25993660e330SKris Buschelman idt -= 4; 26003660e330SKris Buschelman } 2601eb05f457SKris Buschelman 2602eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 2603eb05f457SKris Buschelman idt = 4*(n-1); 2604eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 2605eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2606eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2607eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 2608eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 2609eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 2610eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 2611eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 2612eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 261354693613SKris Buschelman idt -= 4; 26143660e330SKris Buschelman } 2615eb05f457SKris Buschelman 2616eb05f457SKris Buschelman } /* End of artificial scope. */ 26171ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 26181ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2619dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 26203660e330SKris Buschelman SSE_SCOPE_END; 26213660e330SKris Buschelman PetscFunctionReturn(0); 26223660e330SKris Buschelman } 26233660e330SKris Buschelman 26247cf1b8d3SKris Buschelman #undef __FUNCT__ 26257cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2626dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 26277cf1b8d3SKris Buschelman { 26287cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 26297cf1b8d3SKris Buschelman int *aj=a->j; 2630dfbe8321SBarry Smith PetscErrorCode ierr; 2631dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 26327cf1b8d3SKris Buschelman MatScalar *aa=a->a; 26337cf1b8d3SKris Buschelman PetscScalar *x,*b; 26347cf1b8d3SKris Buschelman 26357cf1b8d3SKris Buschelman PetscFunctionBegin; 26367cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 26377cf1b8d3SKris Buschelman /* 26387cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 26397cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 26407cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 26417cf1b8d3SKris Buschelman */ 26427cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 26437cf1b8d3SKris Buschelman 26441ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26467cf1b8d3SKris Buschelman { 26477cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 26487cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 26497cf1b8d3SKris Buschelman int nz,i,idt,ai16; 26507cf1b8d3SKris Buschelman int jdx,idx; 26517cf1b8d3SKris Buschelman int *vi; 26527cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 26537cf1b8d3SKris Buschelman 26547cf1b8d3SKris Buschelman /* First block is the identity. */ 26557cf1b8d3SKris Buschelman idx = 0; 26567cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 26577cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 26587cf1b8d3SKris Buschelman 26597cf1b8d3SKris Buschelman for (i=1; i<n;) { 26607cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 26617cf1b8d3SKris Buschelman vi = aj + ai[i]; 26627cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 26637cf1b8d3SKris Buschelman idx += 4; 26647cf1b8d3SKris Buschelman 26657cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 26667cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 26677cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 26687cf1b8d3SKris Buschelman 26697cf1b8d3SKris Buschelman while (nz--) { 26707cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 26717cf1b8d3SKris Buschelman jdx = 4*(*vi++); 26727cf1b8d3SKris Buschelman /* jdx = *vi++; */ 26737cf1b8d3SKris Buschelman 26747cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 26757cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 26767cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 26777cf1b8d3SKris Buschelman 26787cf1b8d3SKris Buschelman /* First Column */ 26797cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 26807cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 26817cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 26827cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 26837cf1b8d3SKris Buschelman 26847cf1b8d3SKris Buschelman /* Second Column */ 26857cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 26867cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 26877cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 26887cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 26897cf1b8d3SKris Buschelman 26907cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 26917cf1b8d3SKris Buschelman 26927cf1b8d3SKris Buschelman /* Third Column */ 26937cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 26947cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 26957cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 26967cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 26977cf1b8d3SKris Buschelman 26987cf1b8d3SKris Buschelman /* Fourth Column */ 26997cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 27007cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27017cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 27027cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 27037cf1b8d3SKris Buschelman SSE_INLINE_END_2 27047cf1b8d3SKris Buschelman 27057cf1b8d3SKris Buschelman v += 16; 27067cf1b8d3SKris Buschelman } 27077cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 27087cf1b8d3SKris Buschelman PREFETCH_NTA(v); 27097cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 27107cf1b8d3SKris Buschelman } 27117cf1b8d3SKris Buschelman 27127cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 27137cf1b8d3SKris Buschelman 27147cf1b8d3SKris Buschelman idt = 4*(n-1); 27157cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 27167cf1b8d3SKris Buschelman v = aa + ai16 + 16; 27177cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 27187cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 27197cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 27207cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 27217cf1b8d3SKris Buschelman 27227cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 27237cf1b8d3SKris Buschelman 27247cf1b8d3SKris Buschelman while (nz--) { 27257cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 27267cf1b8d3SKris Buschelman idx = 4*(*vi++); 27277cf1b8d3SKris Buschelman /* idx = *vi++; */ 27287cf1b8d3SKris Buschelman 27297cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 27307cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 27317cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 27327cf1b8d3SKris Buschelman 27337cf1b8d3SKris Buschelman /* First Column */ 27347cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 27357cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 27367cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 27377cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 27387cf1b8d3SKris Buschelman 27397cf1b8d3SKris Buschelman /* Second Column */ 27407cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 27417cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 27427cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 27437cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 27447cf1b8d3SKris Buschelman 27457cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 27467cf1b8d3SKris Buschelman 27477cf1b8d3SKris Buschelman /* Third Column */ 27487cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 27497cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 27507cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 27517cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 27527cf1b8d3SKris Buschelman 27537cf1b8d3SKris Buschelman /* Fourth Column */ 27547cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 27557cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27567cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 27577cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 27587cf1b8d3SKris Buschelman SSE_INLINE_END_2 27597cf1b8d3SKris Buschelman v += 16; 27607cf1b8d3SKris Buschelman } 27617cf1b8d3SKris Buschelman v = aa + ai16; 27627cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 27637cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 27647cf1b8d3SKris Buschelman /* 27657cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 27667cf1b8d3SKris Buschelman which was inverted as part of the factorization 27677cf1b8d3SKris Buschelman */ 27687cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 27697cf1b8d3SKris Buschelman /* First Column */ 27707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 27717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 27727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 27737cf1b8d3SKris Buschelman 27747cf1b8d3SKris Buschelman /* Second Column */ 27757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 27767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 27777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 27787cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 27797cf1b8d3SKris Buschelman 27807cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 27817cf1b8d3SKris Buschelman 27827cf1b8d3SKris Buschelman /* Third Column */ 27837cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 27847cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 27857cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 27867cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 27877cf1b8d3SKris Buschelman 27887cf1b8d3SKris Buschelman /* Fourth Column */ 27897cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 27907cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 27917cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 27927cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 27937cf1b8d3SKris Buschelman 27947cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 27957cf1b8d3SKris Buschelman SSE_INLINE_END_3 27967cf1b8d3SKris Buschelman 27977cf1b8d3SKris Buschelman v = aa + ai16 + 16; 27987cf1b8d3SKris Buschelman idt -= 4; 27997cf1b8d3SKris Buschelman } 28007cf1b8d3SKris Buschelman 28017cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 28027cf1b8d3SKris Buschelman idt = 4*(n-1); 28037cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 28047cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 28057cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 28067cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 28077cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 28087cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 28097cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 28107cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 28117cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 28127cf1b8d3SKris Buschelman idt -= 4; 28137cf1b8d3SKris Buschelman } 28147cf1b8d3SKris Buschelman 28157cf1b8d3SKris Buschelman } /* End of artificial scope. */ 28161ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 28171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2818dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28197cf1b8d3SKris Buschelman SSE_SCOPE_END; 28207cf1b8d3SKris Buschelman PetscFunctionReturn(0); 28217cf1b8d3SKris Buschelman } 28227cf1b8d3SKris Buschelman 28233660e330SKris Buschelman #endif 28244a2ae208SSatish Balay #undef __FUNCT__ 28254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2826dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 28274e2b4712SSatish Balay { 28284e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 28294e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 28306849ba73SBarry Smith PetscErrorCode ierr; 28315d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 28325d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2833d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2834d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2835d9fead3dSBarry Smith const PetscScalar *b; 28364e2b4712SSatish Balay 28374e2b4712SSatish Balay PetscFunctionBegin; 2838d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28391ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2840f1af5d2fSBarry Smith t = a->solve_work; 28414e2b4712SSatish Balay 28424e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28434e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28444e2b4712SSatish Balay 28454e2b4712SSatish Balay /* forward solve the lower triangular */ 28464e2b4712SSatish Balay idx = 3*(*r++); 2847f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 28484e2b4712SSatish Balay for (i=1; i<n; i++) { 28494e2b4712SSatish Balay v = aa + 9*ai[i]; 28504e2b4712SSatish Balay vi = aj + ai[i]; 28514e2b4712SSatish Balay nz = diag[i] - ai[i]; 28524e2b4712SSatish Balay idx = 3*(*r++); 2853f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 28544e2b4712SSatish Balay while (nz--) { 28554e2b4712SSatish Balay idx = 3*(*vi++); 2856f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2857f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2858f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2859f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 28604e2b4712SSatish Balay v += 9; 28614e2b4712SSatish Balay } 28624e2b4712SSatish Balay idx = 3*i; 2863f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 28644e2b4712SSatish Balay } 28654e2b4712SSatish Balay /* backward solve the upper triangular */ 28664e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28674e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 28684e2b4712SSatish Balay vi = aj + diag[i] + 1; 28694e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28704e2b4712SSatish Balay idt = 3*i; 2871f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 28724e2b4712SSatish Balay while (nz--) { 28734e2b4712SSatish Balay idx = 3*(*vi++); 2874f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2875f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2876f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2877f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 28784e2b4712SSatish Balay v += 9; 28794e2b4712SSatish Balay } 28804e2b4712SSatish Balay idc = 3*(*c--); 28814e2b4712SSatish Balay v = aa + 9*diag[i]; 2882f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2883f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2884f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 28854e2b4712SSatish Balay } 28864e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28874e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2888d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2890dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 28914e2b4712SSatish Balay PetscFunctionReturn(0); 28924e2b4712SSatish Balay } 28934e2b4712SSatish Balay 289415091d37SBarry Smith /* 289515091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 289615091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 289715091d37SBarry Smith */ 28984a2ae208SSatish Balay #undef __FUNCT__ 28994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2900dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 290115091d37SBarry Smith { 290215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2904dfbe8321SBarry Smith PetscErrorCode ierr; 2905690b6cddSBarry Smith PetscInt *diag = a->diag; 2906d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2907d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 2908d9fead3dSBarry Smith const PetscScalar *b; 2909690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 291015091d37SBarry Smith 291115091d37SBarry Smith PetscFunctionBegin; 2912d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 291415091d37SBarry Smith 291515091d37SBarry Smith /* forward solve the lower triangular */ 291615091d37SBarry Smith idx = 0; 291715091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 291815091d37SBarry Smith for (i=1; i<n; i++) { 291915091d37SBarry Smith v = aa + 9*ai[i]; 292015091d37SBarry Smith vi = aj + ai[i]; 292115091d37SBarry Smith nz = diag[i] - ai[i]; 292215091d37SBarry Smith idx += 3; 2923f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 292415091d37SBarry Smith while (nz--) { 292515091d37SBarry Smith jdx = 3*(*vi++); 292615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2927f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2928f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2929f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 293015091d37SBarry Smith v += 9; 293115091d37SBarry Smith } 2932f1af5d2fSBarry Smith x[idx] = s1; 2933f1af5d2fSBarry Smith x[1+idx] = s2; 2934f1af5d2fSBarry Smith x[2+idx] = s3; 293515091d37SBarry Smith } 293615091d37SBarry Smith /* backward solve the upper triangular */ 293715091d37SBarry Smith for (i=n-1; i>=0; i--){ 293815091d37SBarry Smith v = aa + 9*diag[i] + 9; 293915091d37SBarry Smith vi = aj + diag[i] + 1; 294015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 294115091d37SBarry Smith idt = 3*i; 2942f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2943f1af5d2fSBarry Smith s3 = x[2+idt]; 294415091d37SBarry Smith while (nz--) { 294515091d37SBarry Smith idx = 3*(*vi++); 294615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2947f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2948f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2949f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 295015091d37SBarry Smith v += 9; 295115091d37SBarry Smith } 295215091d37SBarry Smith v = aa + 9*diag[i]; 2953f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2954f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2955f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 295615091d37SBarry Smith } 295715091d37SBarry Smith 2958d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29591ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2960dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 296115091d37SBarry Smith PetscFunctionReturn(0); 296215091d37SBarry Smith } 296315091d37SBarry Smith 29644a2ae208SSatish Balay #undef __FUNCT__ 29654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2966dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 29674e2b4712SSatish Balay { 29684e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 29694e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 29706849ba73SBarry Smith PetscErrorCode ierr; 29715d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 29725d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2973d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2974d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 2975d9fead3dSBarry Smith const PetscScalar *b; 29764e2b4712SSatish Balay 29774e2b4712SSatish Balay PetscFunctionBegin; 2978d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2980f1af5d2fSBarry Smith t = a->solve_work; 29814e2b4712SSatish Balay 29824e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 29834e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 29844e2b4712SSatish Balay 29854e2b4712SSatish Balay /* forward solve the lower triangular */ 29864e2b4712SSatish Balay idx = 2*(*r++); 2987f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 29884e2b4712SSatish Balay for (i=1; i<n; i++) { 29894e2b4712SSatish Balay v = aa + 4*ai[i]; 29904e2b4712SSatish Balay vi = aj + ai[i]; 29914e2b4712SSatish Balay nz = diag[i] - ai[i]; 29924e2b4712SSatish Balay idx = 2*(*r++); 2993f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 29944e2b4712SSatish Balay while (nz--) { 29954e2b4712SSatish Balay idx = 2*(*vi++); 2996f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2997f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 2998f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 29994e2b4712SSatish Balay v += 4; 30004e2b4712SSatish Balay } 30014e2b4712SSatish Balay idx = 2*i; 3002f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 30034e2b4712SSatish Balay } 30044e2b4712SSatish Balay /* backward solve the upper triangular */ 30054e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 30064e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 30074e2b4712SSatish Balay vi = aj + diag[i] + 1; 30084e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 30094e2b4712SSatish Balay idt = 2*i; 3010f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 30114e2b4712SSatish Balay while (nz--) { 30124e2b4712SSatish Balay idx = 2*(*vi++); 3013f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3014f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3015f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 30164e2b4712SSatish Balay v += 4; 30174e2b4712SSatish Balay } 30184e2b4712SSatish Balay idc = 2*(*c--); 30194e2b4712SSatish Balay v = aa + 4*diag[i]; 3020f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3021f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 30224e2b4712SSatish Balay } 30234e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 30244e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3025d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3027dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 30284e2b4712SSatish Balay PetscFunctionReturn(0); 30294e2b4712SSatish Balay } 30304e2b4712SSatish Balay 303115091d37SBarry Smith /* 303215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 303315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 303415091d37SBarry Smith */ 30354a2ae208SSatish Balay #undef __FUNCT__ 30364a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3037dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 303815091d37SBarry Smith { 303915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3040690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3041dfbe8321SBarry Smith PetscErrorCode ierr; 3042690b6cddSBarry Smith PetscInt *diag = a->diag; 3043d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3044d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 3045d9fead3dSBarry Smith const PetscScalar *b; 3046690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 304715091d37SBarry Smith 304815091d37SBarry Smith PetscFunctionBegin; 3049d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30501ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 305115091d37SBarry Smith 305215091d37SBarry Smith /* forward solve the lower triangular */ 305315091d37SBarry Smith idx = 0; 305415091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 305515091d37SBarry Smith for (i=1; i<n; i++) { 305615091d37SBarry Smith v = aa + 4*ai[i]; 305715091d37SBarry Smith vi = aj + ai[i]; 305815091d37SBarry Smith nz = diag[i] - ai[i]; 305915091d37SBarry Smith idx += 2; 3060f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 306115091d37SBarry Smith while (nz--) { 306215091d37SBarry Smith jdx = 2*(*vi++); 306315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 3064f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3065f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 306615091d37SBarry Smith v += 4; 306715091d37SBarry Smith } 3068f1af5d2fSBarry Smith x[idx] = s1; 3069f1af5d2fSBarry Smith x[1+idx] = s2; 307015091d37SBarry Smith } 307115091d37SBarry Smith /* backward solve the upper triangular */ 307215091d37SBarry Smith for (i=n-1; i>=0; i--){ 307315091d37SBarry Smith v = aa + 4*diag[i] + 4; 307415091d37SBarry Smith vi = aj + diag[i] + 1; 307515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 307615091d37SBarry Smith idt = 2*i; 3077f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 307815091d37SBarry Smith while (nz--) { 307915091d37SBarry Smith idx = 2*(*vi++); 308015091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 3081f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 3082f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 308315091d37SBarry Smith v += 4; 308415091d37SBarry Smith } 308515091d37SBarry Smith v = aa + 4*diag[i]; 3086f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 3087f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 308815091d37SBarry Smith } 308915091d37SBarry Smith 3090d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30911ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3092dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 309315091d37SBarry Smith PetscFunctionReturn(0); 309415091d37SBarry Smith } 309515091d37SBarry Smith 30964a2ae208SSatish Balay #undef __FUNCT__ 30974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3098dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 30994e2b4712SSatish Balay { 31004e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 31014e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 31026849ba73SBarry Smith PetscErrorCode ierr; 31035d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 31045d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 31053f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 310687828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 31074e2b4712SSatish Balay 31084e2b4712SSatish Balay PetscFunctionBegin; 31094e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 31104e2b4712SSatish Balay 31111ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 31121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3113f1af5d2fSBarry Smith t = a->solve_work; 31144e2b4712SSatish Balay 31154e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 31164e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 31174e2b4712SSatish Balay 31184e2b4712SSatish Balay /* forward solve the lower triangular */ 3119f1af5d2fSBarry Smith t[0] = b[*r++]; 31204e2b4712SSatish Balay for (i=1; i<n; i++) { 31214e2b4712SSatish Balay v = aa + ai[i]; 31224e2b4712SSatish Balay vi = aj + ai[i]; 31234e2b4712SSatish Balay nz = diag[i] - ai[i]; 3124f1af5d2fSBarry Smith s1 = b[*r++]; 31254e2b4712SSatish Balay while (nz--) { 3126f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 31274e2b4712SSatish Balay } 3128f1af5d2fSBarry Smith t[i] = s1; 31294e2b4712SSatish Balay } 31304e2b4712SSatish Balay /* backward solve the upper triangular */ 31314e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 31324e2b4712SSatish Balay v = aa + diag[i] + 1; 31334e2b4712SSatish Balay vi = aj + diag[i] + 1; 31344e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3135f1af5d2fSBarry Smith s1 = t[i]; 31364e2b4712SSatish Balay while (nz--) { 3137f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 31384e2b4712SSatish Balay } 3139f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 31404e2b4712SSatish Balay } 31414e2b4712SSatish Balay 31424e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 31434e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 31441ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 31451ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3146dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 31474e2b4712SSatish Balay PetscFunctionReturn(0); 31484e2b4712SSatish Balay } 314915091d37SBarry Smith /* 315015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 315115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 315215091d37SBarry Smith */ 31534a2ae208SSatish Balay #undef __FUNCT__ 31544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3155dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 315615091d37SBarry Smith { 315715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3158690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3159dfbe8321SBarry Smith PetscErrorCode ierr; 3160690b6cddSBarry Smith PetscInt *diag = a->diag; 316115091d37SBarry Smith MatScalar *aa=a->a; 316287828ca2SBarry Smith PetscScalar *x,*b; 316387828ca2SBarry Smith PetscScalar s1,x1; 316415091d37SBarry Smith MatScalar *v; 3165690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 316615091d37SBarry Smith 316715091d37SBarry Smith PetscFunctionBegin; 31681ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 31691ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 317015091d37SBarry Smith 317115091d37SBarry Smith /* forward solve the lower triangular */ 317215091d37SBarry Smith idx = 0; 317315091d37SBarry Smith x[0] = b[0]; 317415091d37SBarry Smith for (i=1; i<n; i++) { 317515091d37SBarry Smith v = aa + ai[i]; 317615091d37SBarry Smith vi = aj + ai[i]; 317715091d37SBarry Smith nz = diag[i] - ai[i]; 317815091d37SBarry Smith idx += 1; 3179f1af5d2fSBarry Smith s1 = b[idx]; 318015091d37SBarry Smith while (nz--) { 318115091d37SBarry Smith jdx = *vi++; 318215091d37SBarry Smith x1 = x[jdx]; 3183f1af5d2fSBarry Smith s1 -= v[0]*x1; 318415091d37SBarry Smith v += 1; 318515091d37SBarry Smith } 3186f1af5d2fSBarry Smith x[idx] = s1; 318715091d37SBarry Smith } 318815091d37SBarry Smith /* backward solve the upper triangular */ 318915091d37SBarry Smith for (i=n-1; i>=0; i--){ 319015091d37SBarry Smith v = aa + diag[i] + 1; 319115091d37SBarry Smith vi = aj + diag[i] + 1; 319215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 319315091d37SBarry Smith idt = i; 3194f1af5d2fSBarry Smith s1 = x[idt]; 319515091d37SBarry Smith while (nz--) { 319615091d37SBarry Smith idx = *vi++; 319715091d37SBarry Smith x1 = x[idx]; 3198f1af5d2fSBarry Smith s1 -= v[0]*x1; 319915091d37SBarry Smith v += 1; 320015091d37SBarry Smith } 320115091d37SBarry Smith v = aa + diag[i]; 3202f1af5d2fSBarry Smith x[idt] = v[0]*s1; 320315091d37SBarry Smith } 32041ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 32051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3206dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 320715091d37SBarry Smith PetscFunctionReturn(0); 320815091d37SBarry Smith } 32094e2b4712SSatish Balay 32104e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 32116bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 32126bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 32136bce7ff8SHong Zhang 321484a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 32156bce7ff8SHong Zhang #undef __FUNCT__ 32166bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 32176bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 32186bce7ff8SHong Zhang { 32196bce7ff8SHong Zhang Mat C=B; 32206bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 32216bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 32226bce7ff8SHong Zhang PetscErrorCode ierr; 32236bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 32246bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 32256bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3226914a18a2SHong Zhang MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3227914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3228914a18a2SHong Zhang MatScalar *v_work; 32296bce7ff8SHong Zhang 32306bce7ff8SHong Zhang PetscFunctionBegin; 32316bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 32326bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3233914a18a2SHong Zhang ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3234914a18a2SHong Zhang ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 32356bce7ff8SHong Zhang ics = ic; 32366bce7ff8SHong Zhang 3237914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 3238914a18a2SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3239914a18a2SHong Zhang multiplier = v_work + bs; 3240914a18a2SHong Zhang v_pivots = (PetscInt*)(multiplier + bs2); 3241914a18a2SHong Zhang 32426bce7ff8SHong Zhang for (i=0; i<n; i++){ 32436bce7ff8SHong Zhang /* zero rtmp */ 32446bce7ff8SHong Zhang /* L part */ 32456bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 32466bce7ff8SHong Zhang bjtmp = bj + bi[i]; 3247914a18a2SHong Zhang for (j=0; j<nz; j++){ 3248914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3249914a18a2SHong Zhang } 32506bce7ff8SHong Zhang 32516bce7ff8SHong Zhang /* U part */ 32526bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 32536bce7ff8SHong Zhang bjtmp = bj + bi[2*n-i]; 3254914a18a2SHong Zhang for (j=0; j<nz; j++){ 3255914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3256914a18a2SHong Zhang } 32576bce7ff8SHong Zhang 32586bce7ff8SHong Zhang /* load in initial (unfactored row) */ 32596bce7ff8SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 32606bce7ff8SHong Zhang ajtmp = aj + ai[r[i]]; 3261914a18a2SHong Zhang v = aa + bs2*ai[r[i]]; 32626bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3263914a18a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 32646bce7ff8SHong Zhang } 32656bce7ff8SHong Zhang 32666bce7ff8SHong Zhang /* elimination */ 32676bce7ff8SHong Zhang bjtmp = bj + bi[i]; 32686bce7ff8SHong Zhang row = *bjtmp++; 32696bce7ff8SHong Zhang nzL = bi[i+1] - bi[i]; 32706bce7ff8SHong Zhang k = 0; 32716bce7ff8SHong Zhang while (k < nzL) { 3272914a18a2SHong Zhang pc = rtmp + bs2*row; 3273914a18a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3274914a18a2SHong Zhang if (flg) { 3275914a18a2SHong Zhang pv = b->a + bs2*bdiag[row]; 3276914a18a2SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 32776bce7ff8SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3278914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 32796bce7ff8SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3280914a18a2SHong Zhang for (j=0; j<nz; j++) { 3281914a18a2SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3282914a18a2SHong Zhang } 32836bce7ff8SHong Zhang ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 32846bce7ff8SHong Zhang } 32856bce7ff8SHong Zhang row = *bjtmp++; k++; 32866bce7ff8SHong Zhang } 32876bce7ff8SHong Zhang 32886bce7ff8SHong Zhang /* finished row so stick it into b->a */ 32896bce7ff8SHong Zhang /* L part */ 3290914a18a2SHong Zhang pv = b->a + bs2*bi[i] ; 32916bce7ff8SHong Zhang pj = b->j + bi[i] ; 32926bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 32936bce7ff8SHong Zhang for (j=0; j<nz; j++) { 3294914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 32956bce7ff8SHong Zhang } 32966bce7ff8SHong Zhang 32976bce7ff8SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 3298914a18a2SHong Zhang pv = b->a + bs2*bdiag[i]; 32996bce7ff8SHong Zhang pj = b->j + bdiag[i]; 3300914a18a2SHong Zhang /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3301914a18a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3302914a18a2SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 33036bce7ff8SHong Zhang 33046bce7ff8SHong Zhang /* U part */ 3305914a18a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 33066bce7ff8SHong Zhang pj = b->j + bi[2*n-i]; 33076bce7ff8SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3308914a18a2SHong Zhang for (j=0; j<nz; j++){ 3309914a18a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3310914a18a2SHong Zhang } 33116bce7ff8SHong Zhang } 33126bce7ff8SHong Zhang 33136bce7ff8SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 33146bce7ff8SHong Zhang ierr = PetscFree(v_work);CHKERRQ(ierr); 33156bce7ff8SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 33166bce7ff8SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3317*27019359SHong Zhang 3318*27019359SHong Zhang switch (A->rmap->bs){ 3319*27019359SHong Zhang case 2: 3320*27019359SHong Zhang C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3321*27019359SHong Zhang break; 3322*27019359SHong Zhang 3323*27019359SHong Zhang case 5: 332484a281e5SHong Zhang C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3325*27019359SHong Zhang break; 3326*27019359SHong Zhang default: 332784a281e5SHong Zhang C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3328*27019359SHong Zhang break; 332984a281e5SHong Zhang } 33306bce7ff8SHong Zhang C->assembled = PETSC_TRUE; 3331914a18a2SHong Zhang ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 33326bce7ff8SHong Zhang PetscFunctionReturn(0); 33336bce7ff8SHong Zhang } 33346bce7ff8SHong Zhang 33356bce7ff8SHong Zhang /* 33366bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 33376bce7ff8SHong Zhang Factored arrays bj and ba are stored as 33386bce7ff8SHong Zhang L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 33396bce7ff8SHong Zhang 33406bce7ff8SHong Zhang bi=fact->i is an array of size 2n+2, in which 33416bce7ff8SHong Zhang bi+ 33426bce7ff8SHong Zhang bi[i] -> 1st entry of L(i,:),i=0,...,i-1 33436bce7ff8SHong Zhang bi[n] -> end of L(n-1,:)+1 33446bce7ff8SHong Zhang bi[n+1] -> 1st entry of U(n-1,:) 33456bce7ff8SHong Zhang bi[2n-i] -> 1st entry of U(i,:) 33466bce7ff8SHong Zhang bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 33476bce7ff8SHong Zhang bi[2n] -> end of U(0,:)+1 33486bce7ff8SHong Zhang 33496bce7ff8SHong Zhang U(i,:) contains diag[i] as its last entry, i.e., 33506bce7ff8SHong Zhang U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 33516bce7ff8SHong Zhang */ 33526bce7ff8SHong Zhang #undef __FUNCT__ 33536bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 33546bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 33556bce7ff8SHong Zhang { 33566bce7ff8SHong Zhang 33576bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 33586bce7ff8SHong Zhang PetscErrorCode ierr; 3359914a18a2SHong Zhang PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 33606bce7ff8SHong Zhang PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 33616bce7ff8SHong Zhang 33626bce7ff8SHong Zhang PetscFunctionBegin; 33636bce7ff8SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 33646bce7ff8SHong Zhang b = (Mat_SeqBAIJ*)(fact)->data; 3365914a18a2SHong Zhang bdiag = b->diag; 33666bce7ff8SHong Zhang 33676bce7ff8SHong Zhang /* replace matrix arrays with single allocations, then reset values */ 33686bce7ff8SHong Zhang ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 33696bce7ff8SHong Zhang 33706bce7ff8SHong Zhang ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 33716bce7ff8SHong Zhang ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 33726bce7ff8SHong Zhang ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 33736bce7ff8SHong Zhang b->singlemalloc = PETSC_FALSE; 33746bce7ff8SHong Zhang if (mbs > 0) { 33756bce7ff8SHong Zhang ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 33766bce7ff8SHong Zhang } 33776bce7ff8SHong Zhang 33786bce7ff8SHong Zhang /* set bi and bj with new data structure */ 33796bce7ff8SHong Zhang bi = b->i; 33806bce7ff8SHong Zhang bj = b->j; 33816bce7ff8SHong Zhang 33826bce7ff8SHong Zhang /* L part */ 33836bce7ff8SHong Zhang bi[0] = 0; 33846bce7ff8SHong Zhang for (i=0; i<mbs; i++){ 33856bce7ff8SHong Zhang nz = adiag[i] - ai[i]; 3386914a18a2SHong Zhang bi[i+1] = bi[i] + nz; 33876bce7ff8SHong Zhang aj = a->j + ai[i]; 33886bce7ff8SHong Zhang for (j=0; j<nz; j++){ 33896bce7ff8SHong Zhang *bj = aj[j]; bj++; 33906bce7ff8SHong Zhang } 33916bce7ff8SHong Zhang } 33926bce7ff8SHong Zhang 33936bce7ff8SHong Zhang /* U part */ 33946bce7ff8SHong Zhang bi[mbs+1] = bi[mbs]; 33956bce7ff8SHong Zhang for (i=mbs-1; i>=0; i--){ 33966bce7ff8SHong Zhang nz = ai[i+1] - adiag[i] - 1; 33976bce7ff8SHong Zhang if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3398914a18a2SHong Zhang bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 33996bce7ff8SHong Zhang aj = a->j + adiag[i] + 1; 34006bce7ff8SHong Zhang for (j=0; j<nz; j++){ 34016bce7ff8SHong Zhang *bj = aj[j]; bj++; 34026bce7ff8SHong Zhang } 34036bce7ff8SHong Zhang /* diag[i] */ 34046bce7ff8SHong Zhang *bj = i; bj++; 34056bce7ff8SHong Zhang bdiag[i] = bi[2*mbs-i+1]-1; 34066bce7ff8SHong Zhang } 34076bce7ff8SHong Zhang PetscFunctionReturn(0); 34086bce7ff8SHong Zhang } 34096bce7ff8SHong Zhang 34104e2b4712SSatish Balay /* 34114e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 34124e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 34134e2b4712SSatish Balay Not a good example of code reuse. 34144e2b4712SSatish Balay */ 3415435faa5fSBarry Smith 34164a2ae208SSatish Balay #undef __FUNCT__ 34174a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 34180481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 34194e2b4712SSatish Balay { 34204e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 34214e2b4712SSatish Balay IS isicol; 34226849ba73SBarry Smith PetscErrorCode ierr; 34235d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 34245d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3425a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3426d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 342741df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 3428329f5518SBarry Smith PetscReal f; 34294e2b4712SSatish Balay 34304e2b4712SSatish Balay PetscFunctionBegin; 34316bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 34326bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 34336bce7ff8SHong Zhang 3434435faa5fSBarry Smith f = info->fill; 3435690b6cddSBarry Smith levels = (PetscInt)info->levels; 3436690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 34374c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3438667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3439667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 34407d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 3441309c388cSBarry Smith 344241df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 34436bce7ff8SHong Zhang 34446bce7ff8SHong Zhang PetscTruth newdatastruct=PETSC_FALSE; 34456bce7ff8SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 34466bce7ff8SHong Zhang if (newdatastruct){ 34476bce7ff8SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 34486bce7ff8SHong Zhang (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 34496bce7ff8SHong Zhang } else { 3450719d5645SBarry Smith ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 34516bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 34526bce7ff8SHong Zhang } 34536bce7ff8SHong Zhang 3454719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 3455719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3456bb3d539aSBarry Smith b->row = isrow; 3457bb3d539aSBarry Smith b->col = iscol; 3458bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3459bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3460bb3d539aSBarry Smith b->icol = isicol; 3461bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3462719d5645SBarry Smith ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 34636bce7ff8SHong Zhang PetscFunctionReturn(0); 34646bce7ff8SHong Zhang } 34656bce7ff8SHong Zhang 34666bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 34674e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 34684e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 34694e2b4712SSatish Balay 34704e2b4712SSatish Balay /* get new row pointers */ 3471690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 34724e2b4712SSatish Balay ainew[0] = 0; 34734e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 3474690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 3475690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 34764e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 3477690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 34784e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 3479690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 34804e2b4712SSatish Balay /* im is level for each filled value */ 3481690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 34824e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 3483690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 34844e2b4712SSatish Balay dloc[0] = 0; 34854e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 3486435faa5fSBarry Smith 3487435faa5fSBarry Smith /* copy prow into linked list */ 34884e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 34893b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 34904e2b4712SSatish Balay xi = aj + ai[r[prow]]; 34914e2b4712SSatish Balay fill[n] = n; 3492435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 34934e2b4712SSatish Balay while (nz--) { 34944e2b4712SSatish Balay fm = n; 34954e2b4712SSatish Balay idx = ic[*xi++]; 34964e2b4712SSatish Balay do { 34974e2b4712SSatish Balay m = fm; 34984e2b4712SSatish Balay fm = fill[m]; 34994e2b4712SSatish Balay } while (fm < idx); 35004e2b4712SSatish Balay fill[m] = idx; 35014e2b4712SSatish Balay fill[idx] = fm; 35024e2b4712SSatish Balay im[idx] = 0; 35034e2b4712SSatish Balay } 3504435faa5fSBarry Smith 3505435faa5fSBarry Smith /* make sure diagonal entry is included */ 3506435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 3507435faa5fSBarry Smith fm = n; 3508435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 3509435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3510435faa5fSBarry Smith fill[fm] = prow; 3511435faa5fSBarry Smith im[prow] = 0; 3512435faa5fSBarry Smith nzf++; 3513335d9088SBarry Smith dcount++; 3514435faa5fSBarry Smith } 3515435faa5fSBarry Smith 35164e2b4712SSatish Balay nzi = 0; 35174e2b4712SSatish Balay row = fill[n]; 35184e2b4712SSatish Balay while (row < prow) { 35194e2b4712SSatish Balay incrlev = im[row] + 1; 35204e2b4712SSatish Balay nz = dloc[row]; 3521435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 35224e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 35234e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 35244e2b4712SSatish Balay fm = row; 35254e2b4712SSatish Balay while (nnz-- > 0) { 35264e2b4712SSatish Balay idx = *xi++; 35274e2b4712SSatish Balay if (*flev + incrlev > levels) { 35284e2b4712SSatish Balay flev++; 35294e2b4712SSatish Balay continue; 35304e2b4712SSatish Balay } 35314e2b4712SSatish Balay do { 35324e2b4712SSatish Balay m = fm; 35334e2b4712SSatish Balay fm = fill[m]; 35344e2b4712SSatish Balay } while (fm < idx); 35354e2b4712SSatish Balay if (fm != idx) { 35364e2b4712SSatish Balay im[idx] = *flev + incrlev; 35374e2b4712SSatish Balay fill[m] = idx; 35384e2b4712SSatish Balay fill[idx] = fm; 35394e2b4712SSatish Balay fm = idx; 35404e2b4712SSatish Balay nzf++; 3541ecf371e4SBarry Smith } else { 35424e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 35434e2b4712SSatish Balay } 35444e2b4712SSatish Balay flev++; 35454e2b4712SSatish Balay } 35464e2b4712SSatish Balay row = fill[row]; 35474e2b4712SSatish Balay nzi++; 35484e2b4712SSatish Balay } 35494e2b4712SSatish Balay /* copy new filled row into permanent storage */ 35504e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 35514e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 3552ecf371e4SBarry Smith 3553ecf371e4SBarry Smith /* estimate how much additional space we will need */ 3554ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3555ecf371e4SBarry Smith /* just double the memory each time */ 3556690b6cddSBarry Smith PetscInt maxadd = jmax; 3557ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 35584e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 35594e2b4712SSatish Balay jmax += maxadd; 3560ecf371e4SBarry Smith 3561ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 35625d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 35635d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3564606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 35655d0c19d7SBarry Smith ajnew = xitmp; 35665d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 35675d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3568606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 35695d0c19d7SBarry Smith ajfill = xitmp; 3570eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 35714e2b4712SSatish Balay } 35725d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 35734e2b4712SSatish Balay flev = ajfill + ainew[prow]; 35744e2b4712SSatish Balay dloc[prow] = nzi; 35754e2b4712SSatish Balay fm = fill[n]; 35764e2b4712SSatish Balay while (nzf--) { 35775d0c19d7SBarry Smith *xitmp++ = fm; 35784e2b4712SSatish Balay *flev++ = im[fm]; 35794e2b4712SSatish Balay fm = fill[fm]; 35804e2b4712SSatish Balay } 3581435faa5fSBarry Smith /* make sure row has diagonal entry */ 3582435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 358377431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 35842401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3585435faa5fSBarry Smith } 35864e2b4712SSatish Balay } 3587606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 35884e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 35894e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3590606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 3591606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 35924e2b4712SSatish Balay 35936cf91177SBarry Smith #if defined(PETSC_USE_INFO) 35944e2b4712SSatish Balay { 3595329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3596ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3597ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3598ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3599ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3600335d9088SBarry Smith if (diagonal_fill) { 3601ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3602335d9088SBarry Smith } 36034e2b4712SSatish Balay } 360463ba0a88SBarry Smith #endif 36054e2b4712SSatish Balay 36064e2b4712SSatish Balay /* put together the new matrix */ 3607719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3608719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3609719d5645SBarry Smith b = (Mat_SeqBAIJ*)(fact)->data; 3610e6b907acSBarry Smith b->free_a = PETSC_TRUE; 3611e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 36127c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 3613a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 36144e2b4712SSatish Balay b->j = ajnew; 36154e2b4712SSatish Balay b->i = ainew; 36164e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 36174e2b4712SSatish Balay b->diag = dloc; 36184e2b4712SSatish Balay b->ilen = 0; 36194e2b4712SSatish Balay b->imax = 0; 36204e2b4712SSatish Balay b->row = isrow; 36214e2b4712SSatish Balay b->col = iscol; 3622bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3623c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3624c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3625e51c0b9cSSatish Balay b->icol = isicol; 362687828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 36274e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 36284e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 3629719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 36304e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 36314e2b4712SSatish Balay 3632719d5645SBarry Smith (fact)->info.factor_mallocs = reallocate; 3633719d5645SBarry Smith (fact)->info.fill_ratio_given = f; 3634719d5645SBarry Smith (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 36356bce7ff8SHong Zhang 363641df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 36378661488fSKris Buschelman PetscFunctionReturn(0); 36388661488fSKris Buschelman } 36398661488fSKris Buschelman 3640732ee342SKris Buschelman #undef __FUNCT__ 36417e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3642dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 36437e7071cdSKris Buschelman { 364412272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 364512272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 36465a9542e3SKris Buschelman PetscFunctionBegin; 36477cf1b8d3SKris Buschelman /* Undo Column scaling */ 36487cf1b8d3SKris Buschelman /* while (nz--) { */ 36497cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 36507cf1b8d3SKris Buschelman /* } */ 3651c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 3652c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 36537cf1b8d3SKris Buschelman PetscFunctionReturn(0); 36547cf1b8d3SKris Buschelman } 36557cf1b8d3SKris Buschelman 36567cf1b8d3SKris Buschelman #undef __FUNCT__ 36577cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3658dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 36597cf1b8d3SKris Buschelman { 36607cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3661b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 36622aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 36635a9542e3SKris Buschelman PetscFunctionBegin; 36640b9da03eSKris Buschelman /* Is this really necessary? */ 366520235379SKris Buschelman while (nz--) { 36660b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 36677e7071cdSKris Buschelman } 3668c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 36697e7071cdSKris Buschelman PetscFunctionReturn(0); 36707e7071cdSKris Buschelman } 36717e7071cdSKris Buschelman 3672732ee342SKris Buschelman 3673