1be1d678aSKris Buschelman 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 7c6db04a5SJed Brown #include <../src/mat/blockinvert.h> 8c6db04a5SJed Brown #include <petscbt.h> 9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1493fd935bSShri Abhyankar { 1593fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1693fd935bSShri Abhyankar PetscErrorCode ierr; 1793fd935bSShri Abhyankar const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 1893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 1993fd935bSShri Abhyankar PetscInt nz; 2093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 2193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 2293fd935bSShri Abhyankar const PetscScalar *b; 2393fd935bSShri Abhyankar 2493fd935bSShri Abhyankar PetscFunctionBegin; 253649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2793fd935bSShri Abhyankar tmp = a->solve_work; 2893fd935bSShri Abhyankar 2993fd935bSShri Abhyankar 3093fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 3193fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[i]; 3293fd935bSShri Abhyankar 3393fd935bSShri Abhyankar /* forward solve the U^T */ 3493fd935bSShri Abhyankar for (i=0; i<n; i++) { 3593fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 3693fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 3793fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 3893fd935bSShri Abhyankar s1 = tmp[i]; 3993fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 4093fd935bSShri 
Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 4193fd935bSShri Abhyankar tmp[i] = s1; 4293fd935bSShri Abhyankar } 4393fd935bSShri Abhyankar 4493fd935bSShri Abhyankar /* backward solve the L^T */ 4593fd935bSShri Abhyankar for (i=n-1; i>=0; i--) { 4693fd935bSShri Abhyankar v = aa + ai[i]; 4793fd935bSShri Abhyankar vi = aj + ai[i]; 4893fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 4993fd935bSShri Abhyankar s1 = tmp[i]; 5093fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 5193fd935bSShri Abhyankar } 5293fd935bSShri Abhyankar 5393fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 5493fd935bSShri Abhyankar for (i=0; i<n; i++) x[i] = tmp[i]; 5593fd935bSShri Abhyankar 563649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5793fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5893fd935bSShri Abhyankar 5993fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 6093fd935bSShri Abhyankar PetscFunctionReturn(0); 6193fd935bSShri Abhyankar } 6293fd935bSShri Abhyankar 6393fd935bSShri Abhyankar #undef __FUNCT__ 6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66f1af5d2fSBarry Smith { 67f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 68dfbe8321SBarry Smith PetscErrorCode ierr; 690b68f018SBarry Smith PetscInt i,nz; 700b68f018SBarry Smith const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 710b68f018SBarry Smith const MatScalar *aa=a->a,*v; 720b68f018SBarry Smith PetscScalar s1,*x; 73f1af5d2fSBarry Smith 74f1af5d2fSBarry Smith PetscFunctionBegin; 75ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 77f1af5d2fSBarry Smith 78f1af5d2fSBarry Smith /* forward solve the U^T */ 79f1af5d2fSBarry Smith for (i=0; i<n; i++) { 80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith v = aa + diag[i]; 82f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 83ef66eb69SBarry Smith s1 = (*v++)*x[i]; 84f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 85f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 86f1af5d2fSBarry Smith while (nz--) { 87f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 88f1af5d2fSBarry Smith } 89f1af5d2fSBarry Smith x[i] = s1; 90f1af5d2fSBarry Smith } 91f1af5d2fSBarry Smith /* backward solve the L^T */ 92f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 93f1af5d2fSBarry Smith v = aa + diag[i] - 1; 94f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 95f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 96f1af5d2fSBarry Smith s1 = x[i]; 97f1af5d2fSBarry Smith while (nz--) { 98f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 99f1af5d2fSBarry Smith } 100f1af5d2fSBarry Smith } 1011ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 102dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 103f1af5d2fSBarry Smith PetscFunctionReturn(0); 104f1af5d2fSBarry Smith } 105f1af5d2fSBarry Smith 1064a2ae208SSatish Balay #undef __FUNCT__ 10706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 10806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 109f1af5d2fSBarry Smith { 110f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 111dfbe8321SBarry Smith PetscErrorCode ierr; 112b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 113b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 114b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 115b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 116f1af5d2fSBarry Smith 117f1af5d2fSBarry Smith PetscFunctionBegin; 118ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120f1af5d2fSBarry Smith 121f1af5d2fSBarry Smith /* forward solve the U^T */ 
122f1af5d2fSBarry Smith idx = 0; 123f1af5d2fSBarry Smith for (i=0; i<n; i++) { 124f1af5d2fSBarry Smith 125f1af5d2fSBarry Smith v = aa + 4*diag[i]; 126f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 127ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 128f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 129f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 130f1af5d2fSBarry Smith v += 4; 131f1af5d2fSBarry Smith 132f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 133f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 134f1af5d2fSBarry Smith while (nz--) { 135f1af5d2fSBarry Smith oidx = 2*(*vi++); 136f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 137f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 138f1af5d2fSBarry Smith v += 4; 139f1af5d2fSBarry Smith } 140f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 141f1af5d2fSBarry Smith idx += 2; 142f1af5d2fSBarry Smith } 143f1af5d2fSBarry Smith /* backward solve the L^T */ 144f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 145f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 146f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 147f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 148f1af5d2fSBarry Smith idt = 2*i; 149f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 150f1af5d2fSBarry Smith while (nz--) { 151f1af5d2fSBarry Smith idx = 2*(*vi--); 152f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 153f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 154f1af5d2fSBarry Smith v -= 4; 155f1af5d2fSBarry Smith } 156f1af5d2fSBarry Smith } 1571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 159f1af5d2fSBarry Smith PetscFunctionReturn(0); 160f1af5d2fSBarry Smith } 161f1af5d2fSBarry Smith 1624a2ae208SSatish Balay #undef __FUNCT__ 1634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 
1656929473cSShri Abhyankar { 1666929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1676929473cSShri Abhyankar PetscErrorCode ierr; 168b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1696929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 170b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 171b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 172b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 1736929473cSShri Abhyankar 1746929473cSShri Abhyankar PetscFunctionBegin; 1756929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1766929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1776929473cSShri Abhyankar 1786929473cSShri Abhyankar /* forward solve the U^T */ 1796929473cSShri Abhyankar idx = 0; 1806929473cSShri Abhyankar for (i=0; i<n; i++) { 1816929473cSShri Abhyankar v = aa + bs2*diag[i]; 1826929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1836929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1846929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1856929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1866929473cSShri Abhyankar v -= bs2; 1876929473cSShri Abhyankar 1886929473cSShri Abhyankar vi = aj + diag[i] - 1; 1896929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1906929473cSShri Abhyankar for (j=0;j>-nz;j--) { 1916929473cSShri Abhyankar oidx = bs*vi[j]; 1926929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1936929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1946929473cSShri Abhyankar v -= bs2; 1956929473cSShri Abhyankar } 1966929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1976929473cSShri Abhyankar idx += bs; 1986929473cSShri Abhyankar } 1996929473cSShri Abhyankar /* backward solve the L^T */ 2006929473cSShri Abhyankar for (i=n-1; i>=0; i--) { 2016929473cSShri Abhyankar v = aa + bs2*ai[i]; 2026929473cSShri Abhyankar vi = aj + ai[i]; 2036929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 2046929473cSShri Abhyankar idt = bs*i; 2056929473cSShri 
Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2066929473cSShri Abhyankar for (j=0;j<nz;j++) { 2076929473cSShri Abhyankar idx = bs*vi[j]; 2086929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 2096929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 2106929473cSShri Abhyankar v += bs2; 2116929473cSShri Abhyankar } 2126929473cSShri Abhyankar } 2136929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2146929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2156929473cSShri Abhyankar PetscFunctionReturn(0); 2166929473cSShri Abhyankar } 2176929473cSShri Abhyankar 2186929473cSShri Abhyankar #undef __FUNCT__ 21906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 22006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 221f1af5d2fSBarry Smith { 222f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 223dfbe8321SBarry Smith PetscErrorCode ierr; 224b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 225b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 226b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 227b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 228f1af5d2fSBarry Smith 229f1af5d2fSBarry Smith PetscFunctionBegin; 230ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 232f1af5d2fSBarry Smith 233f1af5d2fSBarry Smith /* forward solve the U^T */ 234f1af5d2fSBarry Smith idx = 0; 235f1af5d2fSBarry Smith for (i=0; i<n; i++) { 236f1af5d2fSBarry Smith 237f1af5d2fSBarry Smith v = aa + 9*diag[i]; 238f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 239ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 240f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 241f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 242f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 
243f1af5d2fSBarry Smith v += 9; 244f1af5d2fSBarry Smith 245f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 246f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 247f1af5d2fSBarry Smith while (nz--) { 248f1af5d2fSBarry Smith oidx = 3*(*vi++); 249f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 250f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 251f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 252f1af5d2fSBarry Smith v += 9; 253f1af5d2fSBarry Smith } 254f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 255f1af5d2fSBarry Smith idx += 3; 256f1af5d2fSBarry Smith } 257f1af5d2fSBarry Smith /* backward solve the L^T */ 258f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 259f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 260f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 261f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 262f1af5d2fSBarry Smith idt = 3*i; 263f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 264f1af5d2fSBarry Smith while (nz--) { 265f1af5d2fSBarry Smith idx = 3*(*vi--); 266f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 267f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 268f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 269f1af5d2fSBarry Smith v -= 9; 270f1af5d2fSBarry Smith } 271f1af5d2fSBarry Smith } 2721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 274f1af5d2fSBarry Smith PetscFunctionReturn(0); 275f1af5d2fSBarry Smith } 276f1af5d2fSBarry Smith 2774a2ae208SSatish Balay #undef __FUNCT__ 2784dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2794dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2808499736aSShri Abhyankar { 2818499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2828499736aSShri Abhyankar PetscErrorCode ierr; 283b3260449SShri Abhyankar const 
PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2848499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 285b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 286b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 287b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 2888499736aSShri Abhyankar 2898499736aSShri Abhyankar PetscFunctionBegin; 2908499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2918499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2928499736aSShri Abhyankar 2938499736aSShri Abhyankar /* forward solve the U^T */ 2948499736aSShri Abhyankar idx = 0; 2958499736aSShri Abhyankar for (i=0; i<n; i++) { 2968499736aSShri Abhyankar v = aa + bs2*diag[i]; 2978499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 2988499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2998499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 3008499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 3018499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 3028499736aSShri Abhyankar v -= bs2; 3038499736aSShri Abhyankar 3048499736aSShri Abhyankar vi = aj + diag[i] - 1; 3058499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3068499736aSShri Abhyankar for (j=0;j>-nz;j--) { 3078499736aSShri Abhyankar oidx = bs*vi[j]; 3088499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3098499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3108499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3118499736aSShri Abhyankar v -= bs2; 3128499736aSShri Abhyankar } 3138499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 3148499736aSShri Abhyankar idx += bs; 3158499736aSShri Abhyankar } 3168499736aSShri Abhyankar /* backward solve the L^T */ 3178499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 3188499736aSShri Abhyankar v = aa + bs2*ai[i]; 3198499736aSShri Abhyankar vi = aj + ai[i]; 3208499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 3218499736aSShri Abhyankar 
idt = bs*i; 3228499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 3238499736aSShri Abhyankar for (j=0;j<nz;j++) { 3248499736aSShri Abhyankar idx = bs*vi[j]; 3258499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3268499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3278499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3288499736aSShri Abhyankar v += bs2; 3298499736aSShri Abhyankar } 3308499736aSShri Abhyankar } 3318499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3328499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3338499736aSShri Abhyankar PetscFunctionReturn(0); 3348499736aSShri Abhyankar } 3358499736aSShri Abhyankar 3368499736aSShri Abhyankar #undef __FUNCT__ 33706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 33806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 339f1af5d2fSBarry Smith { 340f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 341dfbe8321SBarry Smith PetscErrorCode ierr; 342b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 343b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 344b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 345b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 346f1af5d2fSBarry Smith 347f1af5d2fSBarry Smith PetscFunctionBegin; 348ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3491ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350f1af5d2fSBarry Smith 351f1af5d2fSBarry Smith /* forward solve the U^T */ 352f1af5d2fSBarry Smith idx = 0; 353f1af5d2fSBarry Smith for (i=0; i<n; i++) { 354f1af5d2fSBarry Smith 355f1af5d2fSBarry Smith v = aa + 16*diag[i]; 356f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 357ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 358f1af5d2fSBarry Smith s1 = 
v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 359f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 360f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 361f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 362f1af5d2fSBarry Smith v += 16; 363f1af5d2fSBarry Smith 364f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 365f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 366f1af5d2fSBarry Smith while (nz--) { 367f1af5d2fSBarry Smith oidx = 4*(*vi++); 368f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 369f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 370f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 371f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 372f1af5d2fSBarry Smith v += 16; 373f1af5d2fSBarry Smith } 374f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 375f1af5d2fSBarry Smith idx += 4; 376f1af5d2fSBarry Smith } 377f1af5d2fSBarry Smith /* backward solve the L^T */ 378f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 379f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 380f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 381f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 382f1af5d2fSBarry Smith idt = 4*i; 383f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 384f1af5d2fSBarry Smith while (nz--) { 385f1af5d2fSBarry Smith idx = 4*(*vi--); 386f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390f1af5d2fSBarry Smith v -= 16; 391f1af5d2fSBarry Smith } 392f1af5d2fSBarry Smith } 3931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 
395f1af5d2fSBarry Smith PetscFunctionReturn(0); 396f1af5d2fSBarry Smith } 397f1af5d2fSBarry Smith 3984a2ae208SSatish Balay #undef __FUNCT__ 3994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 4004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4018499736aSShri Abhyankar { 4028499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4038499736aSShri Abhyankar PetscErrorCode ierr; 404b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 4058499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 406b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 407b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 408b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 4098499736aSShri Abhyankar 4108499736aSShri Abhyankar PetscFunctionBegin; 4118499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4128499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4138499736aSShri Abhyankar 4148499736aSShri Abhyankar /* forward solve the U^T */ 4158499736aSShri Abhyankar idx = 0; 4168499736aSShri Abhyankar for (i=0; i<n; i++) { 4178499736aSShri Abhyankar v = aa + bs2*diag[i]; 4188499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 4198499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 4208499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 4218499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 4228499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 4238499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 4248499736aSShri Abhyankar v -= bs2; 4258499736aSShri Abhyankar 4268499736aSShri Abhyankar vi = aj + diag[i] - 1; 4278499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 4288499736aSShri Abhyankar for (j=0;j>-nz;j--) { 4298499736aSShri Abhyankar oidx = bs*vi[j]; 4308499736aSShri Abhyankar x[oidx] -= 
v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4318499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4328499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4338499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4348499736aSShri Abhyankar v -= bs2; 4358499736aSShri Abhyankar } 4368499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4378499736aSShri Abhyankar idx += bs; 4388499736aSShri Abhyankar } 4398499736aSShri Abhyankar /* backward solve the L^T */ 4408499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 4418499736aSShri Abhyankar v = aa + bs2*ai[i]; 4428499736aSShri Abhyankar vi = aj + ai[i]; 4438499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4448499736aSShri Abhyankar idt = bs*i; 4458499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4468499736aSShri Abhyankar for (j=0;j<nz;j++) { 4478499736aSShri Abhyankar idx = bs*vi[j]; 4488499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4498499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4508499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4518499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4528499736aSShri Abhyankar v += bs2; 4538499736aSShri Abhyankar } 4548499736aSShri Abhyankar } 4558499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4568499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4578499736aSShri Abhyankar PetscFunctionReturn(0); 4588499736aSShri Abhyankar } 4598499736aSShri Abhyankar 4608499736aSShri Abhyankar #undef __FUNCT__ 46106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 46206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 463f1af5d2fSBarry Smith { 464f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
465dfbe8321SBarry Smith PetscErrorCode ierr; 466b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 467b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 468b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 469b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 470f1af5d2fSBarry Smith 471f1af5d2fSBarry Smith PetscFunctionBegin; 472ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474f1af5d2fSBarry Smith 475f1af5d2fSBarry Smith /* forward solve the U^T */ 476f1af5d2fSBarry Smith idx = 0; 477f1af5d2fSBarry Smith for (i=0; i<n; i++) { 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith v = aa + 25*diag[i]; 480f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 481ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 482f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 485f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 486f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 487f1af5d2fSBarry Smith v += 25; 488f1af5d2fSBarry Smith 489f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 490f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 491f1af5d2fSBarry Smith while (nz--) { 492f1af5d2fSBarry Smith oidx = 5*(*vi++); 493f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 494f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 495f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 496f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 497f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 
498f1af5d2fSBarry Smith v += 25; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 501f1af5d2fSBarry Smith idx += 5; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 505f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith idt = 5*i; 509f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 510f1af5d2fSBarry Smith while (nz--) { 511f1af5d2fSBarry Smith idx = 5*(*vi--); 512f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 513f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 514f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 515f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 516f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 517f1af5d2fSBarry Smith v -= 25; 518f1af5d2fSBarry Smith } 519f1af5d2fSBarry Smith } 5201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 522f1af5d2fSBarry Smith PetscFunctionReturn(0); 523f1af5d2fSBarry Smith } 524f1af5d2fSBarry Smith 5254a2ae208SSatish Balay #undef __FUNCT__ 5264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5288499736aSShri Abhyankar { 5298499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5308499736aSShri Abhyankar PetscErrorCode ierr; 531b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5328499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 533b3260449SShri 
Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 534b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 535b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 5368499736aSShri Abhyankar 5378499736aSShri Abhyankar PetscFunctionBegin; 5388499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5398499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5408499736aSShri Abhyankar 5418499736aSShri Abhyankar /* forward solve the U^T */ 5428499736aSShri Abhyankar idx = 0; 5438499736aSShri Abhyankar for (i=0; i<n; i++) { 5448499736aSShri Abhyankar v = aa + bs2*diag[i]; 5458499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5468499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5478499736aSShri Abhyankar x5 = x[4+idx]; 5488499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5498499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5508499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5518499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5528499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5538499736aSShri Abhyankar v -= bs2; 5548499736aSShri Abhyankar 5558499736aSShri Abhyankar vi = aj + diag[i] - 1; 5568499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5578499736aSShri Abhyankar for (j=0;j>-nz;j--) { 5588499736aSShri Abhyankar oidx = bs*vi[j]; 5598499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5608499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5618499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5628499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5638499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5648499736aSShri Abhyankar v -= bs2; 
5658499736aSShri Abhyankar } 5668499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5678499736aSShri Abhyankar idx += bs; 5688499736aSShri Abhyankar } 5698499736aSShri Abhyankar /* backward solve the L^T */ 5708499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 5718499736aSShri Abhyankar v = aa + bs2*ai[i]; 5728499736aSShri Abhyankar vi = aj + ai[i]; 5738499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 5748499736aSShri Abhyankar idt = bs*i; 5758499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 5768499736aSShri Abhyankar for (j=0;j<nz;j++) { 5778499736aSShri Abhyankar idx = bs*vi[j]; 5788499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5798499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5808499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5818499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5828499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5838499736aSShri Abhyankar v += bs2; 5848499736aSShri Abhyankar } 5858499736aSShri Abhyankar } 5868499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5878499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5888499736aSShri Abhyankar PetscFunctionReturn(0); 5898499736aSShri Abhyankar } 5908499736aSShri Abhyankar 5918499736aSShri Abhyankar #undef __FUNCT__ 59206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 59306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 594f1af5d2fSBarry Smith { 595f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 596dfbe8321SBarry Smith PetscErrorCode ierr; 597b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 598b3260449SShri Abhyankar 
PetscInt i,nz,idx,idt,oidx; 599b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 600b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 601f1af5d2fSBarry Smith 602f1af5d2fSBarry Smith PetscFunctionBegin; 603ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6041ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 605f1af5d2fSBarry Smith 606f1af5d2fSBarry Smith /* forward solve the U^T */ 607f1af5d2fSBarry Smith idx = 0; 608f1af5d2fSBarry Smith for (i=0; i<n; i++) { 609f1af5d2fSBarry Smith 610f1af5d2fSBarry Smith v = aa + 36*diag[i]; 611f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 612ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 613ef66eb69SBarry Smith x6 = x[5+idx]; 614f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 615f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 616f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 617f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 618f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 619f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 620f1af5d2fSBarry Smith v += 36; 621f1af5d2fSBarry Smith 622f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 623f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith oidx = 6*(*vi++); 626f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry 
Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v += 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 635f1af5d2fSBarry Smith x[5+idx] = s6; 636f1af5d2fSBarry Smith idx += 6; 637f1af5d2fSBarry Smith } 638f1af5d2fSBarry Smith /* backward solve the L^T */ 639f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 640f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 641f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 642f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 643f1af5d2fSBarry Smith idt = 6*i; 644f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 645f1af5d2fSBarry Smith s6 = x[5+idt]; 646f1af5d2fSBarry Smith while (nz--) { 647f1af5d2fSBarry Smith idx = 6*(*vi--); 648f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 649f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 650f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 651f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 652f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 653f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 654f1af5d2fSBarry Smith v -= 36; 655f1af5d2fSBarry Smith } 656f1af5d2fSBarry Smith } 6571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 659f1af5d2fSBarry Smith PetscFunctionReturn(0); 660f1af5d2fSBarry Smith } 661f1af5d2fSBarry Smith 6624a2ae208SSatish Balay #undef __FUNCT__ 6634dd39f65SShri Abhyankar #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 6658499736aSShri Abhyankar { 6668499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6678499736aSShri Abhyankar PetscErrorCode ierr; 668b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6698499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 670b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 671b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 672b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 6738499736aSShri Abhyankar 6748499736aSShri Abhyankar PetscFunctionBegin; 6758499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6768499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6778499736aSShri Abhyankar 6788499736aSShri Abhyankar /* forward solve the U^T */ 6798499736aSShri Abhyankar idx = 0; 6808499736aSShri Abhyankar for (i=0; i<n; i++) { 6818499736aSShri Abhyankar v = aa + bs2*diag[i]; 6828499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 6838499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 6848499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 6858499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 6868499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 6878499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 6888499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 6898499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 6908499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 6918499736aSShri Abhyankar v -= bs2; 6928499736aSShri Abhyankar 6938499736aSShri Abhyankar vi = aj + diag[i] - 1; 6948499736aSShri 
Abhyankar nz = diag[i] - diag[i+1] - 1; 6958499736aSShri Abhyankar for (j=0;j>-nz;j--) { 6968499736aSShri Abhyankar oidx = bs*vi[j]; 6978499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 6988499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 6998499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7008499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7018499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7028499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7038499736aSShri Abhyankar v -= bs2; 7048499736aSShri Abhyankar } 7058499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 7068499736aSShri Abhyankar x[5+idx] = s6; 7078499736aSShri Abhyankar idx += bs; 7088499736aSShri Abhyankar } 7098499736aSShri Abhyankar /* backward solve the L^T */ 7108499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 7118499736aSShri Abhyankar v = aa + bs2*ai[i]; 7128499736aSShri Abhyankar vi = aj + ai[i]; 7138499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 7148499736aSShri Abhyankar idt = bs*i; 7158499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 7168499736aSShri Abhyankar s6 = x[5+idt]; 7178499736aSShri Abhyankar for (j=0;j<nz;j++) { 7188499736aSShri Abhyankar idx = bs*vi[j]; 7198499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7208499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7218499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7228499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7238499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + 
v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7248499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7258499736aSShri Abhyankar v += bs2; 7268499736aSShri Abhyankar } 7278499736aSShri Abhyankar } 7288499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7298499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7308499736aSShri Abhyankar PetscFunctionReturn(0); 7318499736aSShri Abhyankar } 7328499736aSShri Abhyankar 7338499736aSShri Abhyankar #undef __FUNCT__ 73406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 73506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 736f1af5d2fSBarry Smith { 737f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 738dfbe8321SBarry Smith PetscErrorCode ierr; 739b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 740b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 741b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 742b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 743f1af5d2fSBarry Smith 744f1af5d2fSBarry Smith PetscFunctionBegin; 745ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7461ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith /* forward solve the U^T */ 749f1af5d2fSBarry Smith idx = 0; 750f1af5d2fSBarry Smith for (i=0; i<n; i++) { 751f1af5d2fSBarry Smith 752f1af5d2fSBarry Smith v = aa + 49*diag[i]; 753f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 754ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 755ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 756f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 757f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + 
v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 758f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 759f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 760f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 761f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 762f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 763f1af5d2fSBarry Smith v += 49; 764f1af5d2fSBarry Smith 765f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 766f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 767f1af5d2fSBarry Smith while (nz--) { 768f1af5d2fSBarry Smith oidx = 7*(*vi++); 769f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 770f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 771f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 772f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 773f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 774f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 775f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 776f1af5d2fSBarry Smith v += 49; 777f1af5d2fSBarry Smith } 778f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 779f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 780f1af5d2fSBarry Smith idx += 7; 781f1af5d2fSBarry Smith } 782f1af5d2fSBarry Smith /* backward solve the L^T */ 783f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 784f1af5d2fSBarry Smith v = aa + 49*diag[i] - 
49; 785f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 786f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 787f1af5d2fSBarry Smith idt = 7*i; 788f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 789f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 790f1af5d2fSBarry Smith while (nz--) { 791f1af5d2fSBarry Smith idx = 7*(*vi--); 792f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 793f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 794f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 795f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 796f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 797f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 798f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 799f1af5d2fSBarry Smith v -= 49; 800f1af5d2fSBarry Smith } 801f1af5d2fSBarry Smith } 8021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 803dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 804f1af5d2fSBarry Smith PetscFunctionReturn(0); 805f1af5d2fSBarry Smith } 8068499736aSShri Abhyankar #undef __FUNCT__ 8074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 8084dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 8098499736aSShri Abhyankar { 8108499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 8118499736aSShri Abhyankar PetscErrorCode ierr; 812b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 8138499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 
814b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 815b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 816b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 8178499736aSShri Abhyankar 8188499736aSShri Abhyankar PetscFunctionBegin; 8198499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 8208499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8218499736aSShri Abhyankar 8228499736aSShri Abhyankar /* forward solve the U^T */ 8238499736aSShri Abhyankar idx = 0; 8248499736aSShri Abhyankar for (i=0; i<n; i++) { 8258499736aSShri Abhyankar v = aa + bs2*diag[i]; 8268499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8278499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8288499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8298499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8308499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8318499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8328499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8338499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8348499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8358499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8368499736aSShri Abhyankar v -= bs2; 8378499736aSShri Abhyankar vi = aj + diag[i] - 1; 8388499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8398499736aSShri Abhyankar for (j=0;j>-nz;j--) { 8408499736aSShri Abhyankar oidx = bs*vi[j]; 8418499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8428499736aSShri Abhyankar x[oidx+1] -= 
v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8438499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8448499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8458499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8468499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8478499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8488499736aSShri Abhyankar v -= bs2; 8498499736aSShri Abhyankar } 8508499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8518499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8528499736aSShri Abhyankar idx += bs; 8538499736aSShri Abhyankar } 8548499736aSShri Abhyankar /* backward solve the L^T */ 8558499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 8568499736aSShri Abhyankar v = aa + bs2*ai[i]; 8578499736aSShri Abhyankar vi = aj + ai[i]; 8588499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8598499736aSShri Abhyankar idt = bs*i; 8608499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8618499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 8628499736aSShri Abhyankar for (j=0;j<nz;j++) { 8638499736aSShri Abhyankar idx = bs*vi[j]; 8648499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8658499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8668499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8678499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8688499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + 
v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8698499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8708499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8718499736aSShri Abhyankar v += bs2; 8728499736aSShri Abhyankar } 8738499736aSShri Abhyankar } 8748499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 8758499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 8768499736aSShri Abhyankar PetscFunctionReturn(0); 8778499736aSShri Abhyankar } 878f1af5d2fSBarry Smith 879f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 8804a2ae208SSatish Balay #undef __FUNCT__ 88193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 88293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 88393fd935bSShri Abhyankar { 88493fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 88593fd935bSShri Abhyankar IS iscol = a->col,isrow = a->row; 88693fd935bSShri Abhyankar PetscErrorCode ierr; 88793fd935bSShri Abhyankar const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 88893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 88993fd935bSShri Abhyankar PetscInt nz; 89093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 89193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 89293fd935bSShri Abhyankar const PetscScalar *b; 89393fd935bSShri Abhyankar 89493fd935bSShri Abhyankar PetscFunctionBegin; 8953649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 89693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 89793fd935bSShri Abhyankar tmp = a->solve_work; 89893fd935bSShri Abhyankar 89993fd935bSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 90093fd935bSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 
90193fd935bSShri Abhyankar 90293fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 90393fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[c[i]]; 90493fd935bSShri Abhyankar 90593fd935bSShri Abhyankar /* forward solve the U^T */ 90693fd935bSShri Abhyankar for (i=0; i<n; i++) { 90793fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 90893fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 90993fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 91093fd935bSShri Abhyankar s1 = tmp[i]; 91193fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 91293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 91393fd935bSShri Abhyankar tmp[i] = s1; 91493fd935bSShri Abhyankar } 91593fd935bSShri Abhyankar 91693fd935bSShri Abhyankar /* backward solve the L^T */ 91793fd935bSShri Abhyankar for (i=n-1; i>=0; i--) { 91893fd935bSShri Abhyankar v = aa + ai[i]; 91993fd935bSShri Abhyankar vi = aj + ai[i]; 92093fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 92193fd935bSShri Abhyankar s1 = tmp[i]; 92293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 92393fd935bSShri Abhyankar } 92493fd935bSShri Abhyankar 92593fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 92693fd935bSShri Abhyankar for (i=0; i<n; i++) x[r[i]] = tmp[i]; 92793fd935bSShri Abhyankar 92893fd935bSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 92993fd935bSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9303649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 93193fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 93293fd935bSShri Abhyankar 93393fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 93493fd935bSShri Abhyankar PetscFunctionReturn(0); 93593fd935bSShri Abhyankar } 93693fd935bSShri Abhyankar 93793fd935bSShri Abhyankar #undef __FUNCT__ 93806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 
93906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 940f1af5d2fSBarry Smith { 941f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 942f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9436849ba73SBarry Smith PetscErrorCode ierr; 9445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 945b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 946b3260449SShri Abhyankar PetscInt i,nz; 947b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 948b3260449SShri Abhyankar PetscScalar s1,*x,*t; 949b3260449SShri Abhyankar const PetscScalar *b; 950f1af5d2fSBarry Smith 951f1af5d2fSBarry Smith PetscFunctionBegin; 9523649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 9531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 954f1af5d2fSBarry Smith t = a->solve_work; 955f1af5d2fSBarry Smith 956f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 957f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 958f1af5d2fSBarry Smith 959f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 960f1af5d2fSBarry Smith for (i=0; i<n; i++) { 961f1af5d2fSBarry Smith t[i] = b[c[i]]; 962f1af5d2fSBarry Smith } 963f1af5d2fSBarry Smith 964f1af5d2fSBarry Smith /* forward solve the U^T */ 965f1af5d2fSBarry Smith for (i=0; i<n; i++) { 966f1af5d2fSBarry Smith 967f1af5d2fSBarry Smith v = aa + diag[i]; 968f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 969f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 970f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 971f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 972f1af5d2fSBarry Smith while (nz--) { 973f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 974f1af5d2fSBarry Smith } 975f1af5d2fSBarry Smith t[i] = s1; 976f1af5d2fSBarry Smith } 977f1af5d2fSBarry Smith /* backward solve the L^T */ 978f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 979f1af5d2fSBarry Smith v = aa + 
diag[i] - 1; 980f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 981f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 982f1af5d2fSBarry Smith s1 = t[i]; 983f1af5d2fSBarry Smith while (nz--) { 984f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 985f1af5d2fSBarry Smith } 986f1af5d2fSBarry Smith } 987f1af5d2fSBarry Smith 988f1af5d2fSBarry Smith /* copy t into x according to permutation */ 989f1af5d2fSBarry Smith for (i=0; i<n; i++) { 990f1af5d2fSBarry Smith x[r[i]] = t[i]; 991f1af5d2fSBarry Smith } 992f1af5d2fSBarry Smith 993f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 994f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9953649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 9961ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 997dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 998f1af5d2fSBarry Smith PetscFunctionReturn(0); 999f1af5d2fSBarry Smith } 1000f1af5d2fSBarry Smith 10014a2ae208SSatish Balay #undef __FUNCT__ 100206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 100306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1004f1af5d2fSBarry Smith { 1005f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1006f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10076849ba73SBarry Smith PetscErrorCode ierr; 10085d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1009b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1010b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1011b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1012b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1013b3260449SShri Abhyankar const PetscScalar *b; 1014f1af5d2fSBarry Smith 1015f1af5d2fSBarry Smith PetscFunctionBegin; 10163649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 10171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
1018f1af5d2fSBarry Smith t = a->solve_work; 1019f1af5d2fSBarry Smith 1020f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1021f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1022f1af5d2fSBarry Smith 1023f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1024f1af5d2fSBarry Smith ii = 0; 1025f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1026f1af5d2fSBarry Smith ic = 2*c[i]; 1027f1af5d2fSBarry Smith t[ii] = b[ic]; 1028f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1029f1af5d2fSBarry Smith ii += 2; 1030f1af5d2fSBarry Smith } 1031f1af5d2fSBarry Smith 1032f1af5d2fSBarry Smith /* forward solve the U^T */ 1033f1af5d2fSBarry Smith idx = 0; 1034f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1035f1af5d2fSBarry Smith 1036f1af5d2fSBarry Smith v = aa + 4*diag[i]; 1037f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1038f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1039f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 1040f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 1041f1af5d2fSBarry Smith v += 4; 1042f1af5d2fSBarry Smith 1043f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1044f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1045f1af5d2fSBarry Smith while (nz--) { 1046f1af5d2fSBarry Smith oidx = 2*(*vi++); 1047f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 1048f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 1049f1af5d2fSBarry Smith v += 4; 1050f1af5d2fSBarry Smith } 1051f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1052f1af5d2fSBarry Smith idx += 2; 1053f1af5d2fSBarry Smith } 1054f1af5d2fSBarry Smith /* backward solve the L^T */ 1055f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1056f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 1057f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1058f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1059f1af5d2fSBarry Smith idt = 2*i; 1060f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1061f1af5d2fSBarry Smith while (nz--) { 1062f1af5d2fSBarry Smith 
idx = 2*(*vi--); 1063f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 1064f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 1065f1af5d2fSBarry Smith v -= 4; 1066f1af5d2fSBarry Smith } 1067f1af5d2fSBarry Smith } 1068f1af5d2fSBarry Smith 1069f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1070f1af5d2fSBarry Smith ii = 0; 1071f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1072f1af5d2fSBarry Smith ir = 2*r[i]; 1073f1af5d2fSBarry Smith x[ir] = t[ii]; 1074f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1075f1af5d2fSBarry Smith ii += 2; 1076f1af5d2fSBarry Smith } 1077f1af5d2fSBarry Smith 1078f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1079f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10803649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 10811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1082dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1083f1af5d2fSBarry Smith PetscFunctionReturn(0); 1084f1af5d2fSBarry Smith } 1085f1af5d2fSBarry Smith 10864a2ae208SSatish Balay #undef __FUNCT__ 10874dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 10884dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 108932121132SShri Abhyankar { 109032121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 109132121132SShri Abhyankar PetscErrorCode ierr; 109232121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1093b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 109432121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 109532121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1096b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1097b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1098b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1099b3260449SShri Abhyankar const PetscScalar *b; 110032121132SShri Abhyankar 
110132121132SShri Abhyankar PetscFunctionBegin; 11023649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 110332121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 110432121132SShri Abhyankar t = a->solve_work; 110532121132SShri Abhyankar 110632121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 110732121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 110832121132SShri Abhyankar 110932121132SShri Abhyankar /* copy b into temp work space according to permutation */ 111032121132SShri Abhyankar for (i=0;i<n;i++) { 111132121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 111232121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 111332121132SShri Abhyankar } 111432121132SShri Abhyankar 111532121132SShri Abhyankar /* forward solve the U^T */ 111632121132SShri Abhyankar idx = 0; 111732121132SShri Abhyankar for (i=0; i<n; i++) { 111832121132SShri Abhyankar v = aa + bs2*diag[i]; 111932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 112032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 112132121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 112232121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 112332121132SShri Abhyankar v -= bs2; 112432121132SShri Abhyankar 112532121132SShri Abhyankar vi = aj + diag[i] - 1; 112632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 112732121132SShri Abhyankar for (j=0;j>-nz;j--) { 112832121132SShri Abhyankar oidx = bs*vi[j]; 112932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 113032121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 113132121132SShri Abhyankar v -= bs2; 113232121132SShri Abhyankar } 113332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 113432121132SShri Abhyankar idx += bs; 113532121132SShri Abhyankar } 113632121132SShri Abhyankar /* backward solve the L^T */ 113732121132SShri Abhyankar for (i=n-1; i>=0; i--) { 113832121132SShri Abhyankar v = aa + bs2*ai[i]; 113932121132SShri Abhyankar vi = aj + ai[i]; 
114032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 114132121132SShri Abhyankar idt = bs*i; 114232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 114332121132SShri Abhyankar for (j=0;j<nz;j++) { 114432121132SShri Abhyankar idx = bs*vi[j]; 114532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 114632121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 114732121132SShri Abhyankar v += bs2; 114832121132SShri Abhyankar } 114932121132SShri Abhyankar } 115032121132SShri Abhyankar 115132121132SShri Abhyankar /* copy t into x according to permutation */ 115232121132SShri Abhyankar for (i=0;i<n;i++) { 115332121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 115432121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 115532121132SShri Abhyankar } 115632121132SShri Abhyankar 115732121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 115832121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11593649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 116032121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 116132121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 116232121132SShri Abhyankar PetscFunctionReturn(0); 116332121132SShri Abhyankar } 116432121132SShri Abhyankar 116532121132SShri Abhyankar #undef __FUNCT__ 116606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 116706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1168f1af5d2fSBarry Smith { 1169f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1170f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11716849ba73SBarry Smith PetscErrorCode ierr; 11725d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1173b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1174b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1175b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1176b3260449SShri Abhyankar 
PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1177b3260449SShri Abhyankar const PetscScalar *b; 1178f1af5d2fSBarry Smith 1179f1af5d2fSBarry Smith PetscFunctionBegin; 11803649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 11811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1182f1af5d2fSBarry Smith t = a->solve_work; 1183f1af5d2fSBarry Smith 1184f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1185f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1186f1af5d2fSBarry Smith 1187f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1188f1af5d2fSBarry Smith ii = 0; 1189f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1190f1af5d2fSBarry Smith ic = 3*c[i]; 1191f1af5d2fSBarry Smith t[ii] = b[ic]; 1192f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1193f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1194f1af5d2fSBarry Smith ii += 3; 1195f1af5d2fSBarry Smith } 1196f1af5d2fSBarry Smith 1197f1af5d2fSBarry Smith /* forward solve the U^T */ 1198f1af5d2fSBarry Smith idx = 0; 1199f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1200f1af5d2fSBarry Smith 1201f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1202f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1203f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1204f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1205f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1206f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1207f1af5d2fSBarry Smith v += 9; 1208f1af5d2fSBarry Smith 1209f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1210f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1211f1af5d2fSBarry Smith while (nz--) { 1212f1af5d2fSBarry Smith oidx = 3*(*vi++); 1213f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1214f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1215f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1216f1af5d2fSBarry Smith v += 9; 1217f1af5d2fSBarry Smith } 
1218f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1219f1af5d2fSBarry Smith idx += 3; 1220f1af5d2fSBarry Smith } 1221f1af5d2fSBarry Smith /* backward solve the L^T */ 1222f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1223f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1224f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1225f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1226f1af5d2fSBarry Smith idt = 3*i; 1227f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1228f1af5d2fSBarry Smith while (nz--) { 1229f1af5d2fSBarry Smith idx = 3*(*vi--); 1230f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1231f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1232f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1233f1af5d2fSBarry Smith v -= 9; 1234f1af5d2fSBarry Smith } 1235f1af5d2fSBarry Smith } 1236f1af5d2fSBarry Smith 1237f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1238f1af5d2fSBarry Smith ii = 0; 1239f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1240f1af5d2fSBarry Smith ir = 3*r[i]; 1241f1af5d2fSBarry Smith x[ir] = t[ii]; 1242f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1243f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1244f1af5d2fSBarry Smith ii += 3; 1245f1af5d2fSBarry Smith } 1246f1af5d2fSBarry Smith 1247f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1248f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12493649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 12501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1251dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1252f1af5d2fSBarry Smith PetscFunctionReturn(0); 1253f1af5d2fSBarry Smith } 1254f1af5d2fSBarry Smith 12554a2ae208SSatish Balay #undef __FUNCT__ 12564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 12574dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 125832121132SShri 
Abhyankar { 125932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 126032121132SShri Abhyankar PetscErrorCode ierr; 126132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1262b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 126332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 126432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1265b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1266b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1267b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1268b3260449SShri Abhyankar const PetscScalar *b; 126932121132SShri Abhyankar 127032121132SShri Abhyankar PetscFunctionBegin; 12713649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 127232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 127332121132SShri Abhyankar t = a->solve_work; 127432121132SShri Abhyankar 127532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 127632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 127732121132SShri Abhyankar 127832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 127932121132SShri Abhyankar for (i=0;i<n;i++) { 128032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 128132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 128232121132SShri Abhyankar } 128332121132SShri Abhyankar 128432121132SShri Abhyankar /* forward solve the U^T */ 128532121132SShri Abhyankar idx = 0; 128632121132SShri Abhyankar for (i=0; i<n; i++) { 128732121132SShri Abhyankar v = aa + bs2*diag[i]; 128832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 128932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 129032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 129132121132SShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 129232121132SShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 
129332121132SShri Abhyankar v -= bs2; 129432121132SShri Abhyankar 129532121132SShri Abhyankar vi = aj + diag[i] - 1; 129632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 129732121132SShri Abhyankar for (j=0;j>-nz;j--) { 129832121132SShri Abhyankar oidx = bs*vi[j]; 129932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 130032121132SShri Abhyankar t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 130132121132SShri Abhyankar t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 130232121132SShri Abhyankar v -= bs2; 130332121132SShri Abhyankar } 130432121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 130532121132SShri Abhyankar idx += bs; 130632121132SShri Abhyankar } 130732121132SShri Abhyankar /* backward solve the L^T */ 130832121132SShri Abhyankar for (i=n-1; i>=0; i--) { 130932121132SShri Abhyankar v = aa + bs2*ai[i]; 131032121132SShri Abhyankar vi = aj + ai[i]; 131132121132SShri Abhyankar nz = ai[i+1] - ai[i]; 131232121132SShri Abhyankar idt = bs*i; 131332121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 131432121132SShri Abhyankar for (j=0;j<nz;j++) { 131532121132SShri Abhyankar idx = bs*vi[j]; 131632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 131732121132SShri Abhyankar t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 131832121132SShri Abhyankar t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 131932121132SShri Abhyankar v += bs2; 132032121132SShri Abhyankar } 132132121132SShri Abhyankar } 132232121132SShri Abhyankar 132332121132SShri Abhyankar /* copy t into x according to permutation */ 132432121132SShri Abhyankar for (i=0;i<n;i++) { 132532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 132632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 132732121132SShri Abhyankar } 132832121132SShri Abhyankar 132932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 133032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13313649974fSBarry Smith ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 133232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 133332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 133432121132SShri Abhyankar PetscFunctionReturn(0); 133532121132SShri Abhyankar } 133632121132SShri Abhyankar 133732121132SShri Abhyankar #undef __FUNCT__ 133806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 133906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1340f1af5d2fSBarry Smith { 1341f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1342f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 13436849ba73SBarry Smith PetscErrorCode ierr; 13445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1345b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1346b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1347b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1348b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1349b3260449SShri Abhyankar const PetscScalar *b; 1350f1af5d2fSBarry Smith 1351f1af5d2fSBarry Smith PetscFunctionBegin; 13523649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 13531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1354f1af5d2fSBarry Smith t = a->solve_work; 1355f1af5d2fSBarry Smith 1356f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1357f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1358f1af5d2fSBarry Smith 1359f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1360f1af5d2fSBarry Smith ii = 0; 1361f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1362f1af5d2fSBarry Smith ic = 4*c[i]; 1363f1af5d2fSBarry Smith t[ii] = b[ic]; 1364f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1365f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1366f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1367f1af5d2fSBarry Smith ii += 
4; 1368f1af5d2fSBarry Smith } 1369f1af5d2fSBarry Smith 1370f1af5d2fSBarry Smith /* forward solve the U^T */ 1371f1af5d2fSBarry Smith idx = 0; 1372f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1373f1af5d2fSBarry Smith 1374f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1375f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1376f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1377f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1378f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1379f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1380f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1381f1af5d2fSBarry Smith v += 16; 1382f1af5d2fSBarry Smith 1383f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1384f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1385f1af5d2fSBarry Smith while (nz--) { 1386f1af5d2fSBarry Smith oidx = 4*(*vi++); 1387f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1388f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1389f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1390f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1391f1af5d2fSBarry Smith v += 16; 1392f1af5d2fSBarry Smith } 1393f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1394f1af5d2fSBarry Smith idx += 4; 1395f1af5d2fSBarry Smith } 1396f1af5d2fSBarry Smith /* backward solve the L^T */ 1397f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1398f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1399f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1400f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1401f1af5d2fSBarry Smith idt = 4*i; 1402f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1403f1af5d2fSBarry Smith while (nz--) { 1404f1af5d2fSBarry Smith idx = 4*(*vi--); 1405f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 
1406f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1407f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1408f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1409f1af5d2fSBarry Smith v -= 16; 1410f1af5d2fSBarry Smith } 1411f1af5d2fSBarry Smith } 1412f1af5d2fSBarry Smith 1413f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1414f1af5d2fSBarry Smith ii = 0; 1415f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1416f1af5d2fSBarry Smith ir = 4*r[i]; 1417f1af5d2fSBarry Smith x[ir] = t[ii]; 1418f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1419f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1420f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1421f1af5d2fSBarry Smith ii += 4; 1422f1af5d2fSBarry Smith } 1423f1af5d2fSBarry Smith 1424f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1425f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14263649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 14271ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1428dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1429f1af5d2fSBarry Smith PetscFunctionReturn(0); 1430f1af5d2fSBarry Smith } 1431f1af5d2fSBarry Smith 14324a2ae208SSatish Balay #undef __FUNCT__ 14334dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 14344dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 143532121132SShri Abhyankar { 143632121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 143732121132SShri Abhyankar PetscErrorCode ierr; 143832121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1439b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 144032121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 144132121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1442b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 
1443b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1444b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1445b3260449SShri Abhyankar const PetscScalar *b; 144632121132SShri Abhyankar 144732121132SShri Abhyankar PetscFunctionBegin; 14483649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 144932121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 145032121132SShri Abhyankar t = a->solve_work; 145132121132SShri Abhyankar 145232121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 145332121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 145432121132SShri Abhyankar 145532121132SShri Abhyankar /* copy b into temp work space according to permutation */ 145632121132SShri Abhyankar for (i=0;i<n;i++) { 145732121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 145832121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 145932121132SShri Abhyankar } 146032121132SShri Abhyankar 146132121132SShri Abhyankar /* forward solve the U^T */ 146232121132SShri Abhyankar idx = 0; 146332121132SShri Abhyankar for (i=0; i<n; i++) { 146432121132SShri Abhyankar v = aa + bs2*diag[i]; 146532121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 146632121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 146732121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 146832121132SShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 146932121132SShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 147032121132SShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 147132121132SShri Abhyankar v -= bs2; 147232121132SShri Abhyankar 147332121132SShri Abhyankar vi = aj + diag[i] - 1; 147432121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 147532121132SShri Abhyankar for (j=0;j>-nz;j--) { 147632121132SShri Abhyankar oidx = bs*vi[j]; 147732121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + 
v[2]*s3 + v[3]*s4; 147832121132SShri Abhyankar t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 147932121132SShri Abhyankar t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 148032121132SShri Abhyankar t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 148132121132SShri Abhyankar v -= bs2; 148232121132SShri Abhyankar } 148332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 148432121132SShri Abhyankar idx += bs; 148532121132SShri Abhyankar } 148632121132SShri Abhyankar /* backward solve the L^T */ 148732121132SShri Abhyankar for (i=n-1; i>=0; i--) { 148832121132SShri Abhyankar v = aa + bs2*ai[i]; 148932121132SShri Abhyankar vi = aj + ai[i]; 149032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 149132121132SShri Abhyankar idt = bs*i; 149232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 149332121132SShri Abhyankar for (j=0;j<nz;j++) { 149432121132SShri Abhyankar idx = bs*vi[j]; 149532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 149632121132SShri Abhyankar t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 149732121132SShri Abhyankar t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 149832121132SShri Abhyankar t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 149932121132SShri Abhyankar v += bs2; 150032121132SShri Abhyankar } 150132121132SShri Abhyankar } 150232121132SShri Abhyankar 150332121132SShri Abhyankar /* copy t into x according to permutation */ 150432121132SShri Abhyankar for (i=0;i<n;i++) { 150532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 150632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 150732121132SShri Abhyankar } 150832121132SShri Abhyankar 150932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 151032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15113649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 151232121132SShri Abhyankar 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 151332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 151432121132SShri Abhyankar PetscFunctionReturn(0); 151532121132SShri Abhyankar } 151632121132SShri Abhyankar 151732121132SShri Abhyankar #undef __FUNCT__ 151806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 151906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1520f1af5d2fSBarry Smith { 1521f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1522f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 15236849ba73SBarry Smith PetscErrorCode ierr; 15245d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1525b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1526b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1527b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1528b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1529b3260449SShri Abhyankar const PetscScalar *b; 1530f1af5d2fSBarry Smith 1531f1af5d2fSBarry Smith PetscFunctionBegin; 15323649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 15331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534f1af5d2fSBarry Smith t = a->solve_work; 1535f1af5d2fSBarry Smith 1536f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1538f1af5d2fSBarry Smith 1539f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1540f1af5d2fSBarry Smith ii = 0; 1541f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1542f1af5d2fSBarry Smith ic = 5*c[i]; 1543f1af5d2fSBarry Smith t[ii] = b[ic]; 1544f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1545f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1546f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1547f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1548f1af5d2fSBarry Smith ii += 5; 
1549f1af5d2fSBarry Smith } 1550f1af5d2fSBarry Smith 1551f1af5d2fSBarry Smith /* forward solve the U^T */ 1552f1af5d2fSBarry Smith idx = 0; 1553f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1554f1af5d2fSBarry Smith 1555f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1556f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1557f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1558f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1559f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1560f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1561f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1562f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1563f1af5d2fSBarry Smith v += 25; 1564f1af5d2fSBarry Smith 1565f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1566f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1567f1af5d2fSBarry Smith while (nz--) { 1568f1af5d2fSBarry Smith oidx = 5*(*vi++); 1569f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1570f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1571f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1572f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1573f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1574f1af5d2fSBarry Smith v += 25; 1575f1af5d2fSBarry Smith } 1576f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1577f1af5d2fSBarry Smith idx += 5; 1578f1af5d2fSBarry Smith } 1579f1af5d2fSBarry Smith /* backward solve the L^T */ 1580f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1581f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 1582f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1583f1af5d2fSBarry Smith nz = diag[i] - 
ai[i]; 1584f1af5d2fSBarry Smith idt = 5*i; 1585f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1586f1af5d2fSBarry Smith while (nz--) { 1587f1af5d2fSBarry Smith idx = 5*(*vi--); 1588f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1589f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1590f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1591f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1592f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1593f1af5d2fSBarry Smith v -= 25; 1594f1af5d2fSBarry Smith } 1595f1af5d2fSBarry Smith } 1596f1af5d2fSBarry Smith 1597f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1598f1af5d2fSBarry Smith ii = 0; 1599f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1600f1af5d2fSBarry Smith ir = 5*r[i]; 1601f1af5d2fSBarry Smith x[ir] = t[ii]; 1602f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1603f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1604f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1605f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1606f1af5d2fSBarry Smith ii += 5; 1607f1af5d2fSBarry Smith } 1608f1af5d2fSBarry Smith 1609f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1610f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16113649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 16121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1613dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1614f1af5d2fSBarry Smith PetscFunctionReturn(0); 1615f1af5d2fSBarry Smith } 1616f1af5d2fSBarry Smith 16174a2ae208SSatish Balay #undef __FUNCT__ 16184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 16194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 162032121132SShri Abhyankar 
{ 162132121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 162232121132SShri Abhyankar PetscErrorCode ierr; 162332121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1624b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 162532121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 162632121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1627b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1628b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1629b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1630b3260449SShri Abhyankar const PetscScalar *b; 163132121132SShri Abhyankar 163232121132SShri Abhyankar PetscFunctionBegin; 16333649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 163432121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 163532121132SShri Abhyankar t = a->solve_work; 163632121132SShri Abhyankar 163732121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 163832121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 163932121132SShri Abhyankar 164032121132SShri Abhyankar /* copy b into temp work space according to permutation */ 164132121132SShri Abhyankar for (i=0;i<n;i++) { 164232121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 164332121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 164432121132SShri Abhyankar t[ii+4] = b[ic+4]; 164532121132SShri Abhyankar } 164632121132SShri Abhyankar 164732121132SShri Abhyankar /* forward solve the U^T */ 164832121132SShri Abhyankar idx = 0; 164932121132SShri Abhyankar for (i=0; i<n; i++) { 165032121132SShri Abhyankar v = aa + bs2*diag[i]; 165132121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 165232121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 165332121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 
165432121132SShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 165532121132SShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 165632121132SShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 165732121132SShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 165832121132SShri Abhyankar v -= bs2; 165932121132SShri Abhyankar 166032121132SShri Abhyankar vi = aj + diag[i] - 1; 166132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 166232121132SShri Abhyankar for (j=0;j>-nz;j--) { 166332121132SShri Abhyankar oidx = bs*vi[j]; 166432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 166532121132SShri Abhyankar t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 166632121132SShri Abhyankar t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 166732121132SShri Abhyankar t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 166832121132SShri Abhyankar t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 166932121132SShri Abhyankar v -= bs2; 167032121132SShri Abhyankar } 167132121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 167232121132SShri Abhyankar idx += bs; 167332121132SShri Abhyankar } 167432121132SShri Abhyankar /* backward solve the L^T */ 167532121132SShri Abhyankar for (i=n-1; i>=0; i--) { 167632121132SShri Abhyankar v = aa + bs2*ai[i]; 167732121132SShri Abhyankar vi = aj + ai[i]; 167832121132SShri Abhyankar nz = ai[i+1] - ai[i]; 167932121132SShri Abhyankar idt = bs*i; 168032121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 168132121132SShri Abhyankar for (j=0;j<nz;j++) { 168232121132SShri Abhyankar idx = bs*vi[j]; 168332121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 168432121132SShri Abhyankar t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 168532121132SShri 
Abhyankar t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 168632121132SShri Abhyankar t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 168732121132SShri Abhyankar t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 168832121132SShri Abhyankar v += bs2; 168932121132SShri Abhyankar } 169032121132SShri Abhyankar } 169132121132SShri Abhyankar 169232121132SShri Abhyankar /* copy t into x according to permutation */ 169332121132SShri Abhyankar for (i=0;i<n;i++) { 169432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 169532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 169632121132SShri Abhyankar x[ir+4] = t[ii+4]; 169732121132SShri Abhyankar } 169832121132SShri Abhyankar 169932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 170032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17013649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 170232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 170432121132SShri Abhyankar PetscFunctionReturn(0); 170532121132SShri Abhyankar } 170632121132SShri Abhyankar 170732121132SShri Abhyankar #undef __FUNCT__ 170806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 170906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1710f1af5d2fSBarry Smith { 1711f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1712f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 17136849ba73SBarry Smith PetscErrorCode ierr; 17145d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1715b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1716b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1717b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1718b3260449SShri Abhyankar 
PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1719b3260449SShri Abhyankar const PetscScalar *b; 1720f1af5d2fSBarry Smith 1721f1af5d2fSBarry Smith PetscFunctionBegin; 17223649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 17231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1724f1af5d2fSBarry Smith t = a->solve_work; 1725f1af5d2fSBarry Smith 1726f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1727f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1728f1af5d2fSBarry Smith 1729f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1730f1af5d2fSBarry Smith ii = 0; 1731f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1732f1af5d2fSBarry Smith ic = 6*c[i]; 1733f1af5d2fSBarry Smith t[ii] = b[ic]; 1734f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1735f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1736f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1737f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1738f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1739f1af5d2fSBarry Smith ii += 6; 1740f1af5d2fSBarry Smith } 1741f1af5d2fSBarry Smith 1742f1af5d2fSBarry Smith /* forward solve the U^T */ 1743f1af5d2fSBarry Smith idx = 0; 1744f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1745f1af5d2fSBarry Smith 1746f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1747f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1748f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1749f1af5d2fSBarry Smith x6 = t[5+idx]; 1750f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1751f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1752f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1753f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1754f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + 
v[29]*x6; 1755f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1756f1af5d2fSBarry Smith v += 36; 1757f1af5d2fSBarry Smith 1758f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1759f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1760f1af5d2fSBarry Smith while (nz--) { 1761f1af5d2fSBarry Smith oidx = 6*(*vi++); 1762f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1763f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1764f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1765f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1766f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1767f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1768f1af5d2fSBarry Smith v += 36; 1769f1af5d2fSBarry Smith } 1770f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1771f1af5d2fSBarry Smith t[5+idx] = s6; 1772f1af5d2fSBarry Smith idx += 6; 1773f1af5d2fSBarry Smith } 1774f1af5d2fSBarry Smith /* backward solve the L^T */ 1775f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1776f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1777f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1778f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1779f1af5d2fSBarry Smith idt = 6*i; 1780f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1781f1af5d2fSBarry Smith s6 = t[5+idt]; 1782f1af5d2fSBarry Smith while (nz--) { 1783f1af5d2fSBarry Smith idx = 6*(*vi--); 1784f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1785f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1786f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 
+ v[17]*s6; 1787f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1788f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1789f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1790f1af5d2fSBarry Smith v -= 36; 1791f1af5d2fSBarry Smith } 1792f1af5d2fSBarry Smith } 1793f1af5d2fSBarry Smith 1794f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1795f1af5d2fSBarry Smith ii = 0; 1796f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1797f1af5d2fSBarry Smith ir = 6*r[i]; 1798f1af5d2fSBarry Smith x[ir] = t[ii]; 1799f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1800f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1801f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1802f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1803f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1804f1af5d2fSBarry Smith ii += 6; 1805f1af5d2fSBarry Smith } 1806f1af5d2fSBarry Smith 1807f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1808f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18093649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 18101ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1811dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1812f1af5d2fSBarry Smith PetscFunctionReturn(0); 1813f1af5d2fSBarry Smith } 1814f1af5d2fSBarry Smith 18154a2ae208SSatish Balay #undef __FUNCT__ 18164dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 18174dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 181832121132SShri Abhyankar { 181932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182032121132SShri Abhyankar PetscErrorCode ierr; 182132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1822b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 182332121132SShri Abhyankar const 
PetscInt *r,*c,*rout,*cout; 182432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1825b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1826b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1827b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1828b3260449SShri Abhyankar const PetscScalar *b; 182932121132SShri Abhyankar 183032121132SShri Abhyankar PetscFunctionBegin; 18313649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 183232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 183332121132SShri Abhyankar t = a->solve_work; 183432121132SShri Abhyankar 183532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 183632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 183732121132SShri Abhyankar 183832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 183932121132SShri Abhyankar for (i=0;i<n;i++) { 184032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 184132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 184232121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 184332121132SShri Abhyankar } 184432121132SShri Abhyankar 184532121132SShri Abhyankar /* forward solve the U^T */ 184632121132SShri Abhyankar idx = 0; 184732121132SShri Abhyankar for (i=0; i<n; i++) { 184832121132SShri Abhyankar v = aa + bs2*diag[i]; 184932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 185032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 185132121132SShri Abhyankar x6 = t[5+idx]; 185232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 185332121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 185432121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 185532121132SShri Abhyankar s4 = v[18]*x1 
+ v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 185632121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 185732121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 185832121132SShri Abhyankar v -= bs2; 185932121132SShri Abhyankar 186032121132SShri Abhyankar vi = aj + diag[i] - 1; 186132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 186232121132SShri Abhyankar for (j=0;j>-nz;j--) { 186332121132SShri Abhyankar oidx = bs*vi[j]; 186432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 186532121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 186632121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 186732121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 186832121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 186932121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 187032121132SShri Abhyankar v -= bs2; 187132121132SShri Abhyankar } 187232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 187332121132SShri Abhyankar t[5+idx] = s6; 187432121132SShri Abhyankar idx += bs; 187532121132SShri Abhyankar } 187632121132SShri Abhyankar /* backward solve the L^T */ 187732121132SShri Abhyankar for (i=n-1; i>=0; i--) { 187832121132SShri Abhyankar v = aa + bs2*ai[i]; 187932121132SShri Abhyankar vi = aj + ai[i]; 188032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 188132121132SShri Abhyankar idt = bs*i; 188232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 188332121132SShri Abhyankar s6 = t[5+idt]; 188432121132SShri Abhyankar for (j=0;j<nz;j++) { 188532121132SShri Abhyankar idx = bs*vi[j]; 188632121132SShri Abhyankar t[idx] -= 
v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 188732121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 188832121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 188932121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 189032121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 189132121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 189232121132SShri Abhyankar v += bs2; 189332121132SShri Abhyankar } 189432121132SShri Abhyankar } 189532121132SShri Abhyankar 189632121132SShri Abhyankar /* copy t into x according to permutation */ 189732121132SShri Abhyankar for (i=0;i<n;i++) { 189832121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 189932121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 190032121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 190132121132SShri Abhyankar } 190232121132SShri Abhyankar 190332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 190432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19053649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 190632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 190732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 190832121132SShri Abhyankar PetscFunctionReturn(0); 190932121132SShri Abhyankar } 191032121132SShri Abhyankar 191132121132SShri Abhyankar #undef __FUNCT__ 191206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 191306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1914f1af5d2fSBarry Smith { 1915f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1916f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 
19176849ba73SBarry Smith PetscErrorCode ierr; 19185d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1919b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1920b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1921b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1922b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1923b3260449SShri Abhyankar const PetscScalar *b; 1924f1af5d2fSBarry Smith 1925f1af5d2fSBarry Smith PetscFunctionBegin; 19263649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 19271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1928f1af5d2fSBarry Smith t = a->solve_work; 1929f1af5d2fSBarry Smith 1930f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1931f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1932f1af5d2fSBarry Smith 1933f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1934f1af5d2fSBarry Smith ii = 0; 1935f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1936f1af5d2fSBarry Smith ic = 7*c[i]; 1937f1af5d2fSBarry Smith t[ii] = b[ic]; 1938f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1939f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1940f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1941f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1942f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1943f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1944f1af5d2fSBarry Smith ii += 7; 1945f1af5d2fSBarry Smith } 1946f1af5d2fSBarry Smith 1947f1af5d2fSBarry Smith /* forward solve the U^T */ 1948f1af5d2fSBarry Smith idx = 0; 1949f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1950f1af5d2fSBarry Smith 1951f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1952f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1953f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1954f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1955f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 
+ v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1956f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1957f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1958f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1959f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1960f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1961f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1962f1af5d2fSBarry Smith v += 49; 1963f1af5d2fSBarry Smith 1964f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1965f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1966f1af5d2fSBarry Smith while (nz--) { 1967f1af5d2fSBarry Smith oidx = 7*(*vi++); 1968f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1969f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1970f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1971f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1972f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1973f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1974f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1975f1af5d2fSBarry Smith v += 49; 1976f1af5d2fSBarry Smith } 1977f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1978f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1979f1af5d2fSBarry Smith idx += 7; 1980f1af5d2fSBarry Smith } 
1981f1af5d2fSBarry Smith /* backward solve the L^T */ 1982f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1983f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1984f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1985f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1986f1af5d2fSBarry Smith idt = 7*i; 1987f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1988f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1989f1af5d2fSBarry Smith while (nz--) { 1990f1af5d2fSBarry Smith idx = 7*(*vi--); 1991f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1992f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1993f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1994f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1995f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1996f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1997f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1998f1af5d2fSBarry Smith v -= 49; 1999f1af5d2fSBarry Smith } 2000f1af5d2fSBarry Smith } 2001f1af5d2fSBarry Smith 2002f1af5d2fSBarry Smith /* copy t into x according to permutation */ 2003f1af5d2fSBarry Smith ii = 0; 2004f1af5d2fSBarry Smith for (i=0; i<n; i++) { 2005f1af5d2fSBarry Smith ir = 7*r[i]; 2006f1af5d2fSBarry Smith x[ir] = t[ii]; 2007f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 2008f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 2009f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 2010f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 2011f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 2012f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 2013f1af5d2fSBarry Smith ii += 7; 2014f1af5d2fSBarry Smith } 2015f1af5d2fSBarry Smith 
2016f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2017f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20183649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 20191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2020dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2021f1af5d2fSBarry Smith PetscFunctionReturn(0); 2022f1af5d2fSBarry Smith } 202332121132SShri Abhyankar #undef __FUNCT__ 20244dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 20254dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 202632121132SShri Abhyankar { 202732121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 202832121132SShri Abhyankar PetscErrorCode ierr; 202932121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 2030b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 203132121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 203232121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2033b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2034b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2035b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2036b3260449SShri Abhyankar const PetscScalar *b; 203732121132SShri Abhyankar 203832121132SShri Abhyankar PetscFunctionBegin; 20393649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 204032121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 204132121132SShri Abhyankar t = a->solve_work; 204232121132SShri Abhyankar 204332121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 204432121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 204532121132SShri Abhyankar 204632121132SShri Abhyankar /* copy b into temp work space according to permutation */ 204732121132SShri Abhyankar for (i=0;i<n;i++) { 
204832121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 204932121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 205032121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 205132121132SShri Abhyankar } 205232121132SShri Abhyankar 205332121132SShri Abhyankar /* forward solve the U^T */ 205432121132SShri Abhyankar idx = 0; 205532121132SShri Abhyankar for (i=0; i<n; i++) { 205632121132SShri Abhyankar v = aa + bs2*diag[i]; 205732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 205832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 205932121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 206032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 206132121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 206232121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 206332121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 206432121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 206532121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 206632121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 206732121132SShri Abhyankar v -= bs2; 206832121132SShri Abhyankar 206932121132SShri Abhyankar vi = aj + diag[i] - 1; 207032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 207132121132SShri Abhyankar for (j=0;j>-nz;j--) { 207232121132SShri Abhyankar oidx = bs*vi[j]; 207332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 207432121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 207532121132SShri 
Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 207632121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 207732121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 207832121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 207932121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 208032121132SShri Abhyankar v -= bs2; 208132121132SShri Abhyankar } 208232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 208332121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 208432121132SShri Abhyankar idx += bs; 208532121132SShri Abhyankar } 208632121132SShri Abhyankar /* backward solve the L^T */ 208732121132SShri Abhyankar for (i=n-1; i>=0; i--) { 208832121132SShri Abhyankar v = aa + bs2*ai[i]; 208932121132SShri Abhyankar vi = aj + ai[i]; 209032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 209132121132SShri Abhyankar idt = bs*i; 209232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 209332121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 209432121132SShri Abhyankar for (j=0;j<nz;j++) { 209532121132SShri Abhyankar idx = bs*vi[j]; 209632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 209732121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 209832121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 209932121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 210032121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 210132121132SShri 
Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 210232121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 210332121132SShri Abhyankar v += bs2; 210432121132SShri Abhyankar } 210532121132SShri Abhyankar } 210632121132SShri Abhyankar 210732121132SShri Abhyankar /* copy t into x according to permutation */ 210832121132SShri Abhyankar for (i=0;i<n;i++) { 210932121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 211032121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 211132121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 211232121132SShri Abhyankar } 211332121132SShri Abhyankar 211432121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 211532121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21163649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 211732121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 211832121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 211932121132SShri Abhyankar PetscFunctionReturn(0); 212032121132SShri Abhyankar } 2121f1af5d2fSBarry Smith 21224e2b4712SSatish Balay /* ----------------------------------------------------------- */ 21234a2ae208SSatish Balay #undef __FUNCT__ 212406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 212506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21264e2b4712SSatish Balay { 21274e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21284e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21296849ba73SBarry Smith PetscErrorCode ierr; 2130b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2131b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2132b3260449SShri Abhyankar PetscInt i,nz; 2133b3260449SShri Abhyankar const PetscInt 
bs=A->rmap->bs,bs2=a->bs2; 2134b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2135b3260449SShri Abhyankar PetscScalar *x,*s,*t,*ls; 2136b3260449SShri Abhyankar const PetscScalar *b; 21374e2b4712SSatish Balay 21384e2b4712SSatish Balay PetscFunctionBegin; 21393649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2141f1af5d2fSBarry Smith t = a->solve_work; 21424e2b4712SSatish Balay 21434e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21444e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21454e2b4712SSatish Balay 21464e2b4712SSatish Balay /* forward solve the lower triangular */ 214787828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21484e2b4712SSatish Balay for (i=1; i<n; i++) { 21494e2b4712SSatish Balay v = aa + bs2*ai[i]; 21504e2b4712SSatish Balay vi = aj + ai[i]; 21514e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2152f1af5d2fSBarry Smith s = t + bs*i; 215387828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21544e2b4712SSatish Balay while (nz--) { 215596b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 21564e2b4712SSatish Balay v += bs2; 21574e2b4712SSatish Balay } 21584e2b4712SSatish Balay } 21594e2b4712SSatish Balay /* backward solve the upper triangular */ 2160d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 21614e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 21624e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 21634e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 21644e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 216587828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21664e2b4712SSatish Balay while (nz--) { 216796b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 21684e2b4712SSatish Balay v += bs2; 21694e2b4712SSatish Balay } 
217096b95a6bSBarry Smith PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 217187828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21724e2b4712SSatish Balay } 21734e2b4712SSatish Balay 21744e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21754e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21763649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 21771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2178dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21794e2b4712SSatish Balay PetscFunctionReturn(0); 21804e2b4712SSatish Balay } 21814e2b4712SSatish Balay 21825c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 21835c42ef9dSBarry Smith #undef __FUNCT__ 218406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 218506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21865c42ef9dSBarry Smith { 21875c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21885c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 21895c42ef9dSBarry Smith PetscErrorCode ierr; 21905c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2191b3260449SShri Abhyankar PetscInt i,nz,j; 2192b3260449SShri Abhyankar const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 21935c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 21945c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 21955c42ef9dSBarry Smith const PetscScalar *b; 2196*6e111a19SKarl Rupp 21975c42ef9dSBarry Smith PetscFunctionBegin; 21983649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21995c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22005c42ef9dSBarry Smith t = a->solve_work; 22015c42ef9dSBarry Smith 22025c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22035c42ef9dSBarry Smith ierr 
= ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22045c42ef9dSBarry Smith 22055c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 22065c42ef9dSBarry Smith for (i=0; i<n; i++) { 22075c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22085c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 22095c42ef9dSBarry Smith } 22105c42ef9dSBarry Smith } 22115c42ef9dSBarry Smith 22125c42ef9dSBarry Smith 22135c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 22145c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 22155c42ef9dSBarry Smith for (i=0; i<n; i++) { 22165c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 221796b95a6bSBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 22185c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 22195c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 22205c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 22215c42ef9dSBarry Smith while (nz--) { 222296b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22235c42ef9dSBarry Smith v += bs2; 22245c42ef9dSBarry Smith } 22255c42ef9dSBarry Smith } 22265c42ef9dSBarry Smith 22275c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 22285c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 22295c42ef9dSBarry Smith v = aa + bs2*ai[i]; 22305c42ef9dSBarry Smith vi = aj + ai[i]; 22315c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 22325c42ef9dSBarry Smith while (nz--) { 223396b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22345c42ef9dSBarry Smith v += bs2; 22355c42ef9dSBarry Smith } 22365c42ef9dSBarry Smith } 22375c42ef9dSBarry Smith 22385c42ef9dSBarry Smith /* copy t into x according to permutation */ 22395c42ef9dSBarry Smith for (i=0; i<n; i++) { 22405c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22415c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 22425c42ef9dSBarry Smith } 22435c42ef9dSBarry Smith } 22445c42ef9dSBarry Smith 
22455c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22465c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22473649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22485c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22495c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22505c42ef9dSBarry Smith PetscFunctionReturn(0); 22515c42ef9dSBarry Smith } 22525c42ef9dSBarry Smith 22534a2ae208SSatish Balay #undef __FUNCT__ 22544dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 22554dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 22568499736aSShri Abhyankar { 22578499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22588499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 22598499736aSShri Abhyankar PetscErrorCode ierr; 2260b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2261b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2262b3260449SShri Abhyankar PetscInt i,j,nz; 2263b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 22648499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 22658499736aSShri Abhyankar PetscScalar *x,*t,*ls; 22668499736aSShri Abhyankar const PetscScalar *b; 2267b3260449SShri Abhyankar 22688499736aSShri Abhyankar PetscFunctionBegin; 22693649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 22708499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22718499736aSShri Abhyankar t = a->solve_work; 22728499736aSShri Abhyankar 22738499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22748499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22758499736aSShri Abhyankar 22768499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 22778499736aSShri Abhyankar for (i=0; i<n; i++) { 22788499736aSShri 
Abhyankar for (j=0; j<bs; j++) { 22798499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 22808499736aSShri Abhyankar } 22818499736aSShri Abhyankar } 22828499736aSShri Abhyankar 22838499736aSShri Abhyankar 22848499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 22858499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 22868499736aSShri Abhyankar for (i=0; i<n; i++) { 22878499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 228896b95a6bSBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 22898499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 22908499736aSShri Abhyankar vi = aj + diag[i] - 1; 22918499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 22928499736aSShri Abhyankar for (j=0;j>-nz;j--) { 229396b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22948499736aSShri Abhyankar v -= bs2; 22958499736aSShri Abhyankar } 22968499736aSShri Abhyankar } 22978499736aSShri Abhyankar 22988499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 22998499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 23008499736aSShri Abhyankar v = aa + bs2*ai[i]; 23018499736aSShri Abhyankar vi = aj + ai[i]; 23028499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 23038499736aSShri Abhyankar for (j=0;j<nz;j++) { 230496b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23058499736aSShri Abhyankar v += bs2; 23068499736aSShri Abhyankar } 23078499736aSShri Abhyankar } 23088499736aSShri Abhyankar 23098499736aSShri Abhyankar /* copy t into x according to permutation */ 23108499736aSShri Abhyankar for (i=0; i<n; i++) { 23118499736aSShri Abhyankar for (j=0; j<bs; j++) { 23128499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 23138499736aSShri Abhyankar } 23148499736aSShri Abhyankar } 23158499736aSShri Abhyankar 23168499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23178499736aSShri Abhyankar ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23183649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 23198499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23208499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 23218499736aSShri Abhyankar PetscFunctionReturn(0); 23228499736aSShri Abhyankar } 23238499736aSShri Abhyankar 2324832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 232529a97285SShri Abhyankar 23262b0b2ea7SShri Abhyankar #undef __FUNCT__ 2327832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2328832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 23292b0b2ea7SShri Abhyankar { 23302b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23312b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2332b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 23330fa040f9SShri Abhyankar PetscInt i,nz,idx,idt,m; 23340b68f018SBarry Smith const MatScalar *aa=a->a,*v; 23352b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 23362b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 23370fa040f9SShri Abhyankar PetscScalar *x; 23380b68f018SBarry Smith const PetscScalar *b; 23392b0b2ea7SShri Abhyankar 23402b0b2ea7SShri Abhyankar PetscFunctionBegin; 23413649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23422b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23432b0b2ea7SShri Abhyankar 23442b0b2ea7SShri Abhyankar /* forward solve the lower triangular */ 234529a97285SShri Abhyankar idx = 0; 23460fa040f9SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 23470fa040f9SShri Abhyankar x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] 
= b[8+idx]; x[9] = b[9+idx]; 23480fa040f9SShri Abhyankar x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 23492b0b2ea7SShri Abhyankar 23502b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 23512b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 23522b0b2ea7SShri Abhyankar vi = aj + ai[i]; 23532b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 23540fa040f9SShri Abhyankar idt = bs*i; 23550fa040f9SShri Abhyankar s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 23560fa040f9SShri Abhyankar s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 23570fa040f9SShri Abhyankar s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 23582b0b2ea7SShri Abhyankar for (m=0;m<nz;m++) { 23592b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23600fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 23610fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 23620fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 23632b0b2ea7SShri Abhyankar 23640b8f6341SShri Abhyankar 23652b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 23662b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 23672b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 23682b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + 
v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 23692b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 23702b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 23712b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 23722b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 23732b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 23742b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 23752b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 23762b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 23772b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 23782b0b2ea7SShri Abhyankar s14 -= 
v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 23792b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 23802b0b2ea7SShri Abhyankar 23812b0b2ea7SShri Abhyankar v += bs2; 23822b0b2ea7SShri Abhyankar } 23830fa040f9SShri Abhyankar x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 23840fa040f9SShri Abhyankar x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 23850fa040f9SShri Abhyankar x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 23862b0b2ea7SShri Abhyankar 23872b0b2ea7SShri Abhyankar } 23882b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 23892b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--) { 23902b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 23912b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 23922b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 23932b0b2ea7SShri Abhyankar idt = bs*i; 23940fa040f9SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 23950fa040f9SShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 23960fa040f9SShri Abhyankar s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 23972b0b2ea7SShri Abhyankar 23982b0b2ea7SShri Abhyankar for (m=0;m<nz;m++) { 23992b0b2ea7SShri Abhyankar idx = bs*vi[m]; 24000fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 24010fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 24020fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 24032b0b2ea7SShri Abhyankar 24042b0b2ea7SShri 
Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24052b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24062b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24072b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24082b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24092b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24102b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24112b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24122b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24132b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + 
v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24142b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24152b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 24162b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24172b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24182b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24192b0b2ea7SShri Abhyankar 24202b0b2ea7SShri Abhyankar v += bs2; 24212b0b2ea7SShri Abhyankar } 24222b0b2ea7SShri Abhyankar 24230fa040f9SShri Abhyankar x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 24240fa040f9SShri Abhyankar x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 24250fa040f9SShri Abhyankar x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 24260fa040f9SShri Abhyankar x[3+idt] = v[3]*s1 + v[18]*s2 
+ v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 24270fa040f9SShri Abhyankar x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 24280fa040f9SShri Abhyankar x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 24290fa040f9SShri Abhyankar x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 24300fa040f9SShri Abhyankar x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 24310fa040f9SShri Abhyankar x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 24320fa040f9SShri Abhyankar x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 24330fa040f9SShri Abhyankar x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 24340fa040f9SShri Abhyankar x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 24350fa040f9SShri Abhyankar x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + 
v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 24360fa040f9SShri Abhyankar x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 24370fa040f9SShri Abhyankar x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 24382b0b2ea7SShri Abhyankar 24392b0b2ea7SShri Abhyankar } 24402b0b2ea7SShri Abhyankar 24413649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 24422b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24432b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 24442b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 24452b0b2ea7SShri Abhyankar } 24462b0b2ea7SShri Abhyankar 2447832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing one column at at time */ 2448832cc040SShri Abhyankar /* Default MatSolve for block size 15 */ 2449832cc040SShri Abhyankar 24508499736aSShri Abhyankar #undef __FUNCT__ 2451832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2452832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 24530b8f6341SShri Abhyankar { 24540b8f6341SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24550b8f6341SShri Abhyankar PetscErrorCode ierr; 24560b8f6341SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 245753ef36baSBarry Smith PetscInt i,k,nz,idx,idt,m; 24580b8f6341SShri Abhyankar const MatScalar *aa=a->a,*v; 24590b8f6341SShri Abhyankar PetscScalar s[15]; 246053ef36baSBarry Smith PetscScalar *x,xv; 24610b8f6341SShri Abhyankar const PetscScalar *b; 24620b8f6341SShri Abhyankar 24630b8f6341SShri Abhyankar PetscFunctionBegin; 24643649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 24650b8f6341SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24660b8f6341SShri Abhyankar 24670b8f6341SShri Abhyankar /* forward solve the lower triangular */ 2468832cc040SShri Abhyankar for (i=0; i<n; i++) { 24690b8f6341SShri Abhyankar v = aa + bs2*ai[i]; 24700b8f6341SShri Abhyankar vi = aj + ai[i]; 24710b8f6341SShri Abhyankar nz = ai[i+1] - ai[i]; 24720fa040f9SShri Abhyankar idt = bs*i; 2473832cc040SShri Abhyankar x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2474832cc040SShri Abhyankar x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2475832cc040SShri Abhyankar x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 24760b8f6341SShri Abhyankar for (m=0;m<nz;m++) { 24770b8f6341SShri Abhyankar idx = bs*vi[m]; 24780b8f6341SShri Abhyankar for 
(k=0;k<15;k++) { 247953ef36baSBarry Smith xv = x[k + idx]; 248053ef36baSBarry Smith x[idt] -= v[0]*xv; 248153ef36baSBarry Smith x[1+idt] -= v[1]*xv; 248253ef36baSBarry Smith x[2+idt] -= v[2]*xv; 248353ef36baSBarry Smith x[3+idt] -= v[3]*xv; 248453ef36baSBarry Smith x[4+idt] -= v[4]*xv; 248553ef36baSBarry Smith x[5+idt] -= v[5]*xv; 248653ef36baSBarry Smith x[6+idt] -= v[6]*xv; 248753ef36baSBarry Smith x[7+idt] -= v[7]*xv; 248853ef36baSBarry Smith x[8+idt] -= v[8]*xv; 248953ef36baSBarry Smith x[9+idt] -= v[9]*xv; 249053ef36baSBarry Smith x[10+idt] -= v[10]*xv; 249153ef36baSBarry Smith x[11+idt] -= v[11]*xv; 249253ef36baSBarry Smith x[12+idt] -= v[12]*xv; 249353ef36baSBarry Smith x[13+idt] -= v[13]*xv; 249453ef36baSBarry Smith x[14+idt] -= v[14]*xv; 24950b8f6341SShri Abhyankar v += 15; 24960b8f6341SShri Abhyankar } 24970b8f6341SShri Abhyankar } 24980b8f6341SShri Abhyankar } 24990b8f6341SShri Abhyankar /* backward solve the upper triangular */ 25000b8f6341SShri Abhyankar for (i=n-1; i>=0; i--) { 25010b8f6341SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 25020b8f6341SShri Abhyankar vi = aj + adiag[i+1]+1; 25030b8f6341SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 25040b8f6341SShri Abhyankar idt = bs*i; 25050fa040f9SShri Abhyankar s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 25060fa040f9SShri Abhyankar s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 25070fa040f9SShri Abhyankar s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 25080b8f6341SShri Abhyankar 25090b8f6341SShri Abhyankar for (m=0;m<nz;m++) { 25100b8f6341SShri Abhyankar idx = bs*vi[m]; 25110b8f6341SShri Abhyankar for (k=0;k<15;k++) { 251253ef36baSBarry Smith xv = x[k + idx]; 251353ef36baSBarry Smith s[0] -= v[0]*xv; 251453ef36baSBarry Smith s[1] -= v[1]*xv; 251553ef36baSBarry Smith s[2] -= v[2]*xv; 251653ef36baSBarry Smith s[3] -= v[3]*xv; 251753ef36baSBarry Smith s[4] -= v[4]*xv; 
251853ef36baSBarry Smith s[5] -= v[5]*xv; 251953ef36baSBarry Smith s[6] -= v[6]*xv; 252053ef36baSBarry Smith s[7] -= v[7]*xv; 252153ef36baSBarry Smith s[8] -= v[8]*xv; 252253ef36baSBarry Smith s[9] -= v[9]*xv; 252353ef36baSBarry Smith s[10] -= v[10]*xv; 252453ef36baSBarry Smith s[11] -= v[11]*xv; 252553ef36baSBarry Smith s[12] -= v[12]*xv; 252653ef36baSBarry Smith s[13] -= v[13]*xv; 252753ef36baSBarry Smith s[14] -= v[14]*xv; 25280b8f6341SShri Abhyankar v += 15; 25290b8f6341SShri Abhyankar } 25300b8f6341SShri Abhyankar } 25310fa040f9SShri Abhyankar ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 25320b8f6341SShri Abhyankar for (k=0;k<15;k++) { 25330fa040f9SShri Abhyankar x[idt] += v[0]*s[k]; 25340fa040f9SShri Abhyankar x[1+idt] += v[1]*s[k]; 25350fa040f9SShri Abhyankar x[2+idt] += v[2]*s[k]; 25360fa040f9SShri Abhyankar x[3+idt] += v[3]*s[k]; 25370fa040f9SShri Abhyankar x[4+idt] += v[4]*s[k]; 25380fa040f9SShri Abhyankar x[5+idt] += v[5]*s[k]; 25390fa040f9SShri Abhyankar x[6+idt] += v[6]*s[k]; 25400fa040f9SShri Abhyankar x[7+idt] += v[7]*s[k]; 25410fa040f9SShri Abhyankar x[8+idt] += v[8]*s[k]; 25420fa040f9SShri Abhyankar x[9+idt] += v[9]*s[k]; 25430fa040f9SShri Abhyankar x[10+idt] += v[10]*s[k]; 25440fa040f9SShri Abhyankar x[11+idt] += v[11]*s[k]; 25450fa040f9SShri Abhyankar x[12+idt] += v[12]*s[k]; 25460fa040f9SShri Abhyankar x[13+idt] += v[13]*s[k]; 25470fa040f9SShri Abhyankar x[14+idt] += v[14]*s[k]; 25480b8f6341SShri Abhyankar v += 15; 25490b8f6341SShri Abhyankar } 25500b8f6341SShri Abhyankar } 25513649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 25520b8f6341SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 25530b8f6341SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 25540b8f6341SShri Abhyankar PetscFunctionReturn(0); 25550b8f6341SShri Abhyankar } 25560b8f6341SShri Abhyankar 25570b8f6341SShri Abhyankar 25580b8f6341SShri Abhyankar #undef __FUNCT__ 255906e38f1dSHong Zhang #define 
__FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 256006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 25614e2b4712SSatish Balay { 25624e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 25634e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 25646849ba73SBarry Smith PetscErrorCode ierr; 2565b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2566b3260449SShri Abhyankar const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2567b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2568b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2569b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2570b3260449SShri Abhyankar const PetscScalar *b; 25714e2b4712SSatish Balay 25724e2b4712SSatish Balay PetscFunctionBegin; 25733649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2575f1af5d2fSBarry Smith t = a->solve_work; 25764e2b4712SSatish Balay 25774e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25784e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 25794e2b4712SSatish Balay 25804e2b4712SSatish Balay /* forward solve the lower triangular */ 25814e2b4712SSatish Balay idx = 7*(*r++); 2582f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2583f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2584f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 25854e2b4712SSatish Balay 25864e2b4712SSatish Balay for (i=1; i<n; i++) { 25874e2b4712SSatish Balay v = aa + 49*ai[i]; 25884e2b4712SSatish Balay vi = aj + ai[i]; 25894e2b4712SSatish Balay nz = diag[i] - ai[i]; 25904e2b4712SSatish Balay idx = 7*(*r++); 2591f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2592f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 25934e2b4712SSatish Balay while (nz--) { 25944e2b4712SSatish Balay idx = 7*(*vi++); 
2595f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2596f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2597f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2598f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2599f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2600f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2601f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2602f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2603f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2604f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26054e2b4712SSatish Balay v += 49; 26064e2b4712SSatish Balay } 26074e2b4712SSatish Balay idx = 7*i; 2608f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2609f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2610f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 26114e2b4712SSatish Balay } 26124e2b4712SSatish Balay /* backward solve the upper triangular */ 26134e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 26144e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 26154e2b4712SSatish Balay vi = aj + diag[i] + 1; 26164e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26174e2b4712SSatish Balay idt = 7*i; 2618f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2619f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2620f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 26214e2b4712SSatish Balay while (nz--) { 26224e2b4712SSatish Balay idx = 7*(*vi++); 2623f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2624f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2625f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2626f1af5d2fSBarry Smith s1 -= 
v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2627f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2628f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2629f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2630f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2631f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2632f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26334e2b4712SSatish Balay v += 49; 26344e2b4712SSatish Balay } 26354e2b4712SSatish Balay idc = 7*(*c--); 26364e2b4712SSatish Balay v = aa + 49*diag[i]; 2637f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2638f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2639f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2640f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2641f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2642f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2643f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2644f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2645f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2646f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2647f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2648f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2649f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2650f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 26514e2b4712SSatish Balay } 26524e2b4712SSatish Balay 26534e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 26544e2b4712SSatish Balay ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 26553649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 26561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2657dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 26584e2b4712SSatish Balay PetscFunctionReturn(0); 26594e2b4712SSatish Balay } 26604e2b4712SSatish Balay 26618f690400SShri Abhyankar #undef __FUNCT__ 26624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 26634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 266435aa4fcfSShri Abhyankar { 266535aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 266635aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 266735aa4fcfSShri Abhyankar PetscErrorCode ierr; 2668b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2669b3260449SShri Abhyankar const PetscInt n=a->mbs,*rout,*cout,*vi; 2670b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 2671b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2672b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2673b3260449SShri Abhyankar const PetscScalar *b; 267435aa4fcfSShri Abhyankar 267535aa4fcfSShri Abhyankar PetscFunctionBegin; 26763649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 267735aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 267835aa4fcfSShri Abhyankar t = a->solve_work; 267935aa4fcfSShri Abhyankar 268035aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 268135aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 268235aa4fcfSShri Abhyankar 268335aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 268435aa4fcfSShri Abhyankar idx = 7*r[0]; 268535aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 268635aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 268735aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 
268835aa4fcfSShri Abhyankar 268935aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 269035aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 269135aa4fcfSShri Abhyankar vi = aj + ai[i]; 269235aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 269335aa4fcfSShri Abhyankar idx = 7*r[i]; 269435aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 269535aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 269635aa4fcfSShri Abhyankar for (m=0;m<nz;m++) { 269735aa4fcfSShri Abhyankar idx = 7*vi[m]; 269835aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 269935aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 270035aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 270135aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 270235aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 270335aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 270435aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 270535aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 270635aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 270735aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 270835aa4fcfSShri Abhyankar v += 49; 270935aa4fcfSShri Abhyankar } 271035aa4fcfSShri Abhyankar idx = 7*i; 271135aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 271235aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 271335aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 271435aa4fcfSShri Abhyankar } 271535aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 271635aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--) { 271735aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 271835aa4fcfSShri Abhyankar vi = aj + 
adiag[i+1]+1; 271935aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 272035aa4fcfSShri Abhyankar idt = 7*i; 272135aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 272235aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 272335aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 272435aa4fcfSShri Abhyankar for (m=0;m<nz;m++) { 272535aa4fcfSShri Abhyankar idx = 7*vi[m]; 272635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 272735aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 272835aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 272935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 273035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 273135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 273235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 273335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 273435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 273535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 273635aa4fcfSShri Abhyankar v += 49; 273735aa4fcfSShri Abhyankar } 273835aa4fcfSShri Abhyankar idc = 7*c[i]; 273935aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 274035aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 274135aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 274235aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 274335aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 274435aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 274535aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 274635aa4fcfSShri Abhyankar 
v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 274735aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 274835aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 274935aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 275035aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 275135aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 275235aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 275335aa4fcfSShri Abhyankar } 275435aa4fcfSShri Abhyankar 275535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 275635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 27573649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 275835aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 275935aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 276035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 276135aa4fcfSShri Abhyankar } 276235aa4fcfSShri Abhyankar 276335aa4fcfSShri Abhyankar #undef __FUNCT__ 276406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 276506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 276615091d37SBarry Smith { 276715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2768b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2769dfbe8321SBarry Smith PetscErrorCode ierr; 2770b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 2771d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2772d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2773d9fead3dSBarry Smith const PetscScalar *b; 277415091d37SBarry Smith 277515091d37SBarry Smith PetscFunctionBegin; 27763649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 27771ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 277815091d37SBarry Smith /* forward solve the lower 
triangular */ 277915091d37SBarry Smith idx = 0; 278015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 278115091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 278215091d37SBarry Smith x[6] = b[6+idx]; 278315091d37SBarry Smith for (i=1; i<n; i++) { 278415091d37SBarry Smith v = aa + 49*ai[i]; 278515091d37SBarry Smith vi = aj + ai[i]; 278615091d37SBarry Smith nz = diag[i] - ai[i]; 278715091d37SBarry Smith idx = 7*i; 2788f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2789f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2790f1af5d2fSBarry Smith s7 = b[6+idx]; 279115091d37SBarry Smith while (nz--) { 279215091d37SBarry Smith jdx = 7*(*vi++); 279315091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 279415091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 279515091d37SBarry Smith x7 = x[6+jdx]; 2796f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2797f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2798f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2799f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2800f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2801f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2802f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 280315091d37SBarry Smith v += 49; 280415091d37SBarry Smith } 2805f1af5d2fSBarry Smith x[idx] = s1; 2806f1af5d2fSBarry Smith x[1+idx] = s2; 2807f1af5d2fSBarry Smith x[2+idx] = s3; 2808f1af5d2fSBarry Smith x[3+idx] = s4; 2809f1af5d2fSBarry Smith x[4+idx] = s5; 2810f1af5d2fSBarry Smith x[5+idx] = s6; 2811f1af5d2fSBarry Smith x[6+idx] = s7; 281215091d37SBarry Smith } 
281315091d37SBarry Smith /* backward solve the upper triangular */ 281415091d37SBarry Smith for (i=n-1; i>=0; i--) { 281515091d37SBarry Smith v = aa + 49*diag[i] + 49; 281615091d37SBarry Smith vi = aj + diag[i] + 1; 281715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 281815091d37SBarry Smith idt = 7*i; 2819f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2820f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2821f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 2822f1af5d2fSBarry Smith s7 = x[6+idt]; 282315091d37SBarry Smith while (nz--) { 282415091d37SBarry Smith idx = 7*(*vi++); 282515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 282615091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 282715091d37SBarry Smith x7 = x[6+idx]; 2828f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2829f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2830f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2831f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2832f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2833f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2834f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 283515091d37SBarry Smith v += 49; 283615091d37SBarry Smith } 283715091d37SBarry Smith v = aa + 49*diag[i]; 2838f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2839f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 2840f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2841f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 2842f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2843f1af5d2fSBarry Smith + v[30]*s5 + 
v[37]*s6 + v[44]*s7; 2844f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2845f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 2846f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2847f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 2848f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2849f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 2850f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2851f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 285215091d37SBarry Smith } 285315091d37SBarry Smith 28543649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 28551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2856dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 285715091d37SBarry Smith PetscFunctionReturn(0); 285815091d37SBarry Smith } 285915091d37SBarry Smith

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward and backward block-triangular
   solve with 7x7 blocks and no row/column permutation.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i of the lower factor is stored in entries ai[i] .. ai[i+1]-1 and
   is applied without any diagonal scaling.  Block row i of the upper factor is
   stored in entries adiag[i+1]+1 .. adiag[i]-1; the block that follows them
   (entry adiag[i]) is multiplied into the result, i.e. it is used as the
   inverse of the diagonal block.  Every block is read column-major: entry
   (r,c) is v[r+7*c].  The vector stride bs and block stride bs2 come from the
   matrix, while the unrolled kernel itself is written for 7 and 49.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs,bs = A->rmap->bs,bs2 = a->bs2;
  const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*xj;
  PetscScalar       *x,*xi;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep: the first block row is copied from b unchanged */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  x[4] = b[4];
  x[5] = b[5];
  x[6] = b[6];
  for (i=1; i<n; i++) {
    v  = aa + bs2*ai[i];
    vi = aj + ai[i];
    nz = ai[i+1] - ai[i];
    bi = b + bs*i;
    xi = x + bs*i;
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2]; s4 = bi[3];
    s5 = bi[4]; s6 = bi[5]; s7 = bi[6];
    for (k=0; k<nz; k++) {
      /* subtract (block k of row i) * (already computed block of x) */
      xj = x + bs*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2]; x4 = xj[3];
      x5 = xj[4]; x6 = xj[5]; x7 = xj[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    xi[0] = s1;
    xi[1] = s2;
    xi[2] = s3;
    xi[3] = s4;
    xi[4] = s5;
    xi[5] = s6;
    xi[6] = s7;
  }

  /* backward sweep over the upper factor */
  for (i=n-1; i>=0; i--) {
    v  = aa + bs2*(adiag[i+1]+1);
    vi = aj + adiag[i+1]+1;
    nz = adiag[i] - adiag[i+1] - 1;
    xi = x + bs*i;
    s1 = xi[0]; s2 = xi[1]; s3 = xi[2]; s4 = xi[3];
    s5 = xi[4]; s6 = xi[5]; s7 = xi[6];
    for (k=0; k<nz; k++) {
      xj = x + bs*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2]; x4 = xj[3];
      x5 = xj[4]; x6 = xj[5]; x7 = xj[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* v has stepped past row i's off-diagonal blocks and now addresses the
       block used as the inverse diagonal */
    xi[0] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    xi[1] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    xi[2] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    xi[3] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    xi[4] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    xi[5] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    xi[6] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_inplace - Forward and backward block-triangular solve
   with 6x6 blocks, with row and column permutations taken from the matrix.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i keeps its lower part in entries ai[i] .. diag[i]-1, the block
   used as inverse diagonal at diag[i], and its upper part in
   diag[i]+1 .. ai[i+1]-1.  The right-hand side is gathered through the
   indices of a->row, intermediate results live in a->solve_work, and the
   result is scattered through the indices of a->col.  Blocks are read
   column-major: entry (r,c) is v[r+6*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                isrow = a->row,iscol = a->col;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs;
  const PetscInt    *ai = a->i,*aj = a->j,*diag = a->diag,*vi;
  const PetscInt    *rout,*cout;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*tj;
  PetscScalar       *x,*xc,*t,*ti;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr);

  /* forward sweep: work block i starts from block rout[i] of b */
  bi   = b + 6*rout[0];
  t[0] = bi[0];
  t[1] = bi[1];
  t[2] = bi[2];
  t[3] = bi[3];
  t[4] = bi[4];
  t[5] = bi[5];
  for (i=1; i<n; i++) {
    v  = aa + 36*ai[i];
    vi = aj + ai[i];
    nz = diag[i] - ai[i];
    bi = b + 6*rout[i];
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2];
    s4 = bi[3]; s5 = bi[4]; s6 = bi[5];
    for (k=0; k<nz; k++) {
      tj = t + 6*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2];
      x4 = tj[3]; x5 = tj[4]; x6 = tj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    ti    = t + 6*i;
    ti[0] = s1;
    ti[1] = s2;
    ti[2] = s3;
    ti[3] = s4;
    ti[4] = s5;
    ti[5] = s6;
  }

  /* backward sweep: each finished block is kept in t and also written to
     block cout[i] of x */
  for (i=n-1; i>=0; i--) {
    v  = aa + 36*diag[i] + 36;
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    ti = t + 6*i;
    s1 = ti[0]; s2 = ti[1]; s3 = ti[2];
    s4 = ti[3]; s5 = ti[4]; s6 = ti[5];
    for (k=0; k<nz; k++) {
      tj = t + 6*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2];
      x4 = tj[3]; x5 = tj[4]; x6 = tj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    v     = aa + 36*diag[i];
    xc    = x + 6*cout[i];
    xc[0] = ti[0] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    xc[1] = ti[1] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    xc[2] = ti[2] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    xc[3] = ti[3] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    xc[4] = ti[4] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    xc[5] = ti[5] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6 - Forward and backward block-triangular solve with 6x6
   blocks, with row and column permutations taken from the matrix.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i of the lower factor is stored in entries ai[i] .. ai[i+1]-1.
   Block row i of the upper factor is stored in entries
   adiag[i+1]+1 .. adiag[i]-1, followed at adiag[i] by the block used as the
   inverse diagonal.  The right-hand side is gathered through the indices of
   a->row, intermediate results live in a->solve_work, and the result is
   scattered through the indices of a->col.  Blocks are read column-major:
   entry (r,c) is v[r+6*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                isrow = a->row,iscol = a->col;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs;
  const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
  const PetscInt    *rout,*cout;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*tj;
  PetscScalar       *x,*xc,*t,*ti;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr);

  /* forward sweep: work block i starts from block rout[i] of b */
  bi   = b + 6*rout[0];
  t[0] = bi[0];
  t[1] = bi[1];
  t[2] = bi[2];
  t[3] = bi[3];
  t[4] = bi[4];
  t[5] = bi[5];
  for (i=1; i<n; i++) {
    v  = aa + 36*ai[i];
    vi = aj + ai[i];
    nz = ai[i+1] - ai[i];
    bi = b + 6*rout[i];
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2];
    s4 = bi[3]; s5 = bi[4]; s6 = bi[5];
    for (k=0; k<nz; k++) {
      tj = t + 6*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2];
      x4 = tj[3]; x5 = tj[4]; x6 = tj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    ti    = t + 6*i;
    ti[0] = s1;
    ti[1] = s2;
    ti[2] = s3;
    ti[3] = s4;
    ti[4] = s5;
    ti[5] = s6;
  }

  /* backward sweep: each finished block is kept in t and also written to
     block cout[i] of x */
  for (i=n-1; i>=0; i--) {
    v  = aa + 36*(adiag[i+1]+1);
    vi = aj + adiag[i+1]+1;
    nz = adiag[i] - adiag[i+1] - 1;
    ti = t + 6*i;
    s1 = ti[0]; s2 = ti[1]; s3 = ti[2];
    s4 = ti[3]; s5 = ti[4]; s6 = ti[5];
    for (k=0; k<nz; k++) {
      tj = t + 6*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2];
      x4 = tj[3]; x5 = tj[4]; x6 = tj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* v has stepped past row i's off-diagonal blocks and now addresses the
       block used as the inverse diagonal */
    xc    = x + 6*cout[i];
    xc[0] = ti[0] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    xc[1] = ti[1] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    xc[2] = ti[2] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    xc[3] = ti[3] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    xc[4] = ti[4] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    xc[5] = ti[5] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_inplace - Forward and backward
   block-triangular solve with 6x6 blocks and no row/column permutation.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i keeps its lower part in entries ai[i] .. diag[i]-1, the block
   used as inverse diagonal at diag[i], and its upper part in
   diag[i]+1 .. ai[i+1]-1.  Both sweeps work directly in the array of xx.
   Blocks are read column-major: entry (r,c) is v[r+6*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs;
  const PetscInt    *ai = a->i,*aj = a->j,*diag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*xj;
  PetscScalar       *x,*xi;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep: the first block row is copied from b unchanged */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  x[4] = b[4];
  x[5] = b[5];
  for (i=1; i<n; i++) {
    v  = aa + 36*ai[i];
    vi = aj + ai[i];
    nz = diag[i] - ai[i];
    bi = b + 6*i;
    xi = x + 6*i;
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2];
    s4 = bi[3]; s5 = bi[4]; s6 = bi[5];
    for (k=0; k<nz; k++) {
      xj = x + 6*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2];
      x4 = xj[3]; x5 = xj[4]; x6 = xj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    xi[0] = s1;
    xi[1] = s2;
    xi[2] = s3;
    xi[3] = s4;
    xi[4] = s5;
    xi[5] = s6;
  }

  /* backward sweep over the upper part of each block row */
  for (i=n-1; i>=0; i--) {
    v  = aa + 36*diag[i] + 36;
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    xi = x + 6*i;
    s1 = xi[0]; s2 = xi[1]; s3 = xi[2];
    s4 = xi[3]; s5 = xi[4]; s6 = xi[5];
    for (k=0; k<nz; k++) {
      xj = x + 6*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2];
      x4 = xj[3]; x5 = xj[4]; x6 = xj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    v     = aa + 36*diag[i];
    xi[0] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    xi[1] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    xi[2] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    xi[3] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    xi[4] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    xi[5] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Forward and backward block-triangular
   solve with 6x6 blocks and no row/column permutation.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i of the lower factor is stored in entries ai[i] .. ai[i+1]-1.
   Block row i of the upper factor is stored in entries
   adiag[i+1]+1 .. adiag[i]-1, followed at adiag[i] by the block used as the
   inverse diagonal.  Blocks are read column-major: entry (r,c) is v[r+6*c].
   The vector stride bs and block stride bs2 come from the matrix, while the
   unrolled kernel itself is written for 6 and 36.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs,bs = A->rmap->bs,bs2 = a->bs2;
  const PetscInt    *ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*xj;
  PetscScalar       *x,*xi;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward sweep: the first block row is copied from b unchanged */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  x[4] = b[4];
  x[5] = b[5];
  for (i=1; i<n; i++) {
    v  = aa + bs2*ai[i];
    vi = aj + ai[i];
    nz = ai[i+1] - ai[i];
    bi = b + bs*i;
    xi = x + bs*i;
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2];
    s4 = bi[3]; s5 = bi[4]; s6 = bi[5];
    for (k=0; k<nz; k++) {
      xj = x + bs*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2];
      x4 = xj[3]; x5 = xj[4]; x6 = xj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += bs2;
    }
    xi[0] = s1;
    xi[1] = s2;
    xi[2] = s3;
    xi[3] = s4;
    xi[4] = s5;
    xi[5] = s6;
  }

  /* backward sweep over the upper factor */
  for (i=n-1; i>=0; i--) {
    v  = aa + bs2*(adiag[i+1]+1);
    vi = aj + adiag[i+1]+1;
    nz = adiag[i] - adiag[i+1] - 1;
    xi = x + bs*i;
    s1 = xi[0]; s2 = xi[1]; s3 = xi[2];
    s4 = xi[3]; s5 = xi[4]; s6 = xi[5];
    for (k=0; k<nz; k++) {
      xj = x + bs*vi[k];
      x1 = xj[0]; x2 = xj[1]; x3 = xj[2];
      x4 = xj[3]; x5 = xj[4]; x6 = xj[5];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += bs2;
    }
    /* v has stepped past row i's off-diagonal blocks and now addresses the
       block used as the inverse diagonal */
    xi[0] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    xi[1] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    xi[2] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    xi[3] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    xi[4] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    xi[5] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_5_inplace - Forward and backward block-triangular solve
   with 5x5 blocks, with row and column permutations taken from the matrix.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data holds the triangular factors
-  bb - right-hand side

   Output Parameter:
.  xx - solution

   Notes:
   Block row i keeps its lower part in entries ai[i] .. diag[i]-1, the block
   used as inverse diagonal at diag[i], and its upper part in
   diag[i]+1 .. ai[i+1]-1.  The right-hand side is gathered through the
   indices of a->row, intermediate results live in a->solve_work, and the
   result is scattered through the indices of a->col.  Blocks are read
   column-major: entry (r,c) is v[r+5*c].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                isrow = a->row,iscol = a->col;
  PetscErrorCode    ierr;
  const PetscInt    n = a->mbs;
  const PetscInt    *ai = a->i,*aj = a->j,*diag = a->diag,*vi;
  const PetscInt    *rout,*cout;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bi,*tj;
  PetscScalar       *x,*xc,*t,*ti;
  PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
  PetscInt          i,k,nz;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr);

  /* forward sweep: work block i starts from block rout[i] of b */
  bi   = b + 5*rout[0];
  t[0] = bi[0];
  t[1] = bi[1];
  t[2] = bi[2];
  t[3] = bi[3];
  t[4] = bi[4];
  for (i=1; i<n; i++) {
    v  = aa + 25*ai[i];
    vi = aj + ai[i];
    nz = diag[i] - ai[i];
    bi = b + 5*rout[i];
    s1 = bi[0]; s2 = bi[1]; s3 = bi[2]; s4 = bi[3]; s5 = bi[4];
    for (k=0; k<nz; k++) {
      tj = t + 5*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2]; x4 = tj[3]; x5 = tj[4];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    ti    = t + 5*i;
    ti[0] = s1;
    ti[1] = s2;
    ti[2] = s3;
    ti[3] = s4;
    ti[4] = s5;
  }

  /* backward sweep: each finished block is kept in t and also written to
     block cout[i] of x */
  for (i=n-1; i>=0; i--) {
    v  = aa + 25*diag[i] + 25;
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    ti = t + 5*i;
    s1 = ti[0]; s2 = ti[1]; s3 = ti[2]; s4 = ti[3]; s5 = ti[4];
    for (k=0; k<nz; k++) {
      tj = t + 5*vi[k];
      x1 = tj[0]; x2 = tj[1]; x3 = tj[2]; x4 = tj[3]; x5 = tj[4];
      s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
      s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
      s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
      s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
      s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
      v  += 25;
    }
    v     = aa + 25*diag[i];
    xc    = x + 5*cout[i];
    xc[0] = ti[0] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5;
    xc[1] = ti[1] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5;
    xc[2] = ti[2] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5;
    xc[3] = ti[3] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5;
    xc[4] = ti[4] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
339578bb4007SShri Abhyankar #undef __FUNCT__ 33964dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5" 33974dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 339878bb4007SShri Abhyankar { 339978bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 340078bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 340178bb4007SShri Abhyankar PetscErrorCode ierr; 340278bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3403b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3404b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 340578bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 340678bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 340778bb4007SShri Abhyankar const PetscScalar *b; 340878bb4007SShri Abhyankar 340978bb4007SShri Abhyankar PetscFunctionBegin; 34103649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 341178bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 341278bb4007SShri Abhyankar t = a->solve_work; 341378bb4007SShri Abhyankar 341478bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 341578bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 341678bb4007SShri Abhyankar 341778bb4007SShri Abhyankar /* forward solve the lower triangular */ 341878bb4007SShri Abhyankar idx = 5*r[0]; 341978bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 342078bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 342178bb4007SShri Abhyankar for (i=1; i<n; i++) { 342278bb4007SShri Abhyankar v = aa + 25*ai[i]; 342378bb4007SShri Abhyankar vi = aj + ai[i]; 342478bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 342578bb4007SShri Abhyankar idx = 5*r[i]; 342678bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 342778bb4007SShri Abhyankar s5 = b[4+idx]; 342878bb4007SShri Abhyankar for (m=0;m<nz;m++) { 342978bb4007SShri Abhyankar idx = 5*vi[m]; 
343078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 343178bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 343278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 343378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 343478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 343578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 343678bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 343778bb4007SShri Abhyankar v += 25; 343878bb4007SShri Abhyankar } 343978bb4007SShri Abhyankar idx = 5*i; 344078bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 344178bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 344278bb4007SShri Abhyankar } 344378bb4007SShri Abhyankar /* backward solve the upper triangular */ 344478bb4007SShri Abhyankar for (i=n-1; i>=0; i--) { 344578bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 344678bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 344778bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 344878bb4007SShri Abhyankar idt = 5*i; 344978bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 345078bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 345178bb4007SShri Abhyankar for (m=0;m<nz;m++) { 345278bb4007SShri Abhyankar idx = 5*vi[m]; 345378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 345478bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 345578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 345678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 345778bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 345878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 345978bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 346078bb4007SShri Abhyankar v += 25; 346178bb4007SShri Abhyankar } 
346278bb4007SShri Abhyankar idc = 5*c[i]; 346378bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 346478bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 346578bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 346678bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 346778bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 346878bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 346978bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 347078bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 347178bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 347278bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 347378bb4007SShri Abhyankar } 347478bb4007SShri Abhyankar 347578bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 347678bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 34773649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 347878bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 347978bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 348078bb4007SShri Abhyankar PetscFunctionReturn(0); 348178bb4007SShri Abhyankar } 348278bb4007SShri Abhyankar 34838f690400SShri Abhyankar #undef __FUNCT__ 348406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 348506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 348615091d37SBarry Smith { 348715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3488b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3489b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3490dfbe8321SBarry Smith PetscErrorCode ierr; 3491d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3492d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3493d9fead3dSBarry Smith const PetscScalar *b; 349415091d37SBarry Smith 349515091d37SBarry Smith PetscFunctionBegin; 
34963649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 34971ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 349815091d37SBarry Smith /* forward solve the lower triangular */ 349915091d37SBarry Smith idx = 0; 350015091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 350115091d37SBarry Smith for (i=1; i<n; i++) { 350215091d37SBarry Smith v = aa + 25*ai[i]; 350315091d37SBarry Smith vi = aj + ai[i]; 350415091d37SBarry Smith nz = diag[i] - ai[i]; 350515091d37SBarry Smith idx = 5*i; 3506f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 350715091d37SBarry Smith while (nz--) { 350815091d37SBarry Smith jdx = 5*(*vi++); 350915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3510f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3511f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3512f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3513f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3514f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 351515091d37SBarry Smith v += 25; 351615091d37SBarry Smith } 3517f1af5d2fSBarry Smith x[idx] = s1; 3518f1af5d2fSBarry Smith x[1+idx] = s2; 3519f1af5d2fSBarry Smith x[2+idx] = s3; 3520f1af5d2fSBarry Smith x[3+idx] = s4; 3521f1af5d2fSBarry Smith x[4+idx] = s5; 352215091d37SBarry Smith } 352315091d37SBarry Smith /* backward solve the upper triangular */ 352415091d37SBarry Smith for (i=n-1; i>=0; i--) { 352515091d37SBarry Smith v = aa + 25*diag[i] + 25; 352615091d37SBarry Smith vi = aj + diag[i] + 1; 352715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 352815091d37SBarry Smith idt = 5*i; 3529f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3530f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 353115091d37SBarry Smith while (nz--) { 
353215091d37SBarry Smith idx = 5*(*vi++); 353315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3534f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3535f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3536f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3537f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3538f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 353915091d37SBarry Smith v += 25; 354015091d37SBarry Smith } 354115091d37SBarry Smith v = aa + 25*diag[i]; 3542f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3543f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3544f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3545f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3546f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 354715091d37SBarry Smith } 354815091d37SBarry Smith 35493649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 35501ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3551dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 355215091d37SBarry Smith PetscFunctionReturn(0); 355315091d37SBarry Smith } 355415091d37SBarry Smith 3555cee9d6f2SShri Abhyankar #undef __FUNCT__ 35564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 35574dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 355853cca76cSShri Abhyankar { 355953cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3560b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3561b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 356253cca76cSShri Abhyankar 
PetscErrorCode ierr; 356353cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 356453cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 356553cca76cSShri Abhyankar const PetscScalar *b; 356653cca76cSShri Abhyankar 356753cca76cSShri Abhyankar PetscFunctionBegin; 35683649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 356953cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 357053cca76cSShri Abhyankar /* forward solve the lower triangular */ 357153cca76cSShri Abhyankar idx = 0; 357253cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 357353cca76cSShri Abhyankar for (i=1; i<n; i++) { 357453cca76cSShri Abhyankar v = aa + 25*ai[i]; 357553cca76cSShri Abhyankar vi = aj + ai[i]; 357653cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 357753cca76cSShri Abhyankar idx = 5*i; 357853cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 357953cca76cSShri Abhyankar for (k=0;k<nz;k++) { 358053cca76cSShri Abhyankar jdx = 5*vi[k]; 358153cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 358253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 358353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 358453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 358553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 358653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 358753cca76cSShri Abhyankar v += 25; 358853cca76cSShri Abhyankar } 358953cca76cSShri Abhyankar x[idx] = s1; 359053cca76cSShri Abhyankar x[1+idx] = s2; 359153cca76cSShri Abhyankar x[2+idx] = s3; 359253cca76cSShri Abhyankar x[3+idx] = s4; 359353cca76cSShri Abhyankar x[4+idx] = s5; 359453cca76cSShri Abhyankar } 359553cca76cSShri Abhyankar 359653cca76cSShri Abhyankar /* backward solve the upper triangular */ 
359753cca76cSShri Abhyankar for (i=n-1; i>=0; i--) { 359853cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 359953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 360053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 360153cca76cSShri Abhyankar idt = 5*i; 360253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 360353cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360453cca76cSShri Abhyankar for (k=0;k<nz;k++) { 360553cca76cSShri Abhyankar idx = 5*vi[k]; 360653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 360753cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 360853cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 360953cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 361053cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 361153cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 361253cca76cSShri Abhyankar v += 25; 361353cca76cSShri Abhyankar } 361453cca76cSShri Abhyankar /* x = inv_diagonal*x */ 361553cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 361653cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 361753cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 361853cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 361953cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 362053cca76cSShri Abhyankar } 362153cca76cSShri Abhyankar 36223649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 362353cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 362453cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 362553cca76cSShri Abhyankar PetscFunctionReturn(0); 362653cca76cSShri Abhyankar } 362753cca76cSShri Abhyankar 
362853cca76cSShri Abhyankar #undef __FUNCT__ 362906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 363006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 36314e2b4712SSatish Balay { 36324e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 36334e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 36346849ba73SBarry Smith PetscErrorCode ierr; 3635b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3636b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 36375d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3638d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3639d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3640d9fead3dSBarry Smith const PetscScalar *b; 36414e2b4712SSatish Balay 36424e2b4712SSatish Balay PetscFunctionBegin; 36433649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 36441ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3645f1af5d2fSBarry Smith t = a->solve_work; 36464e2b4712SSatish Balay 36474e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 36484e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 36494e2b4712SSatish Balay 36504e2b4712SSatish Balay /* forward solve the lower triangular */ 36514e2b4712SSatish Balay idx = 4*(*r++); 3652f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3653f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 36544e2b4712SSatish Balay for (i=1; i<n; i++) { 36554e2b4712SSatish Balay v = aa + 16*ai[i]; 36564e2b4712SSatish Balay vi = aj + ai[i]; 36574e2b4712SSatish Balay nz = diag[i] - ai[i]; 36584e2b4712SSatish Balay idx = 4*(*r++); 3659f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 36604e2b4712SSatish Balay while (nz--) { 36614e2b4712SSatish Balay idx = 4*(*vi++); 3662f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3663f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + 
v[8]*x3 + v[12]*x4; 3664f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3665f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3666f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 36674e2b4712SSatish Balay v += 16; 36684e2b4712SSatish Balay } 36694e2b4712SSatish Balay idx = 4*i; 3670f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3671f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 36724e2b4712SSatish Balay } 36734e2b4712SSatish Balay /* backward solve the upper triangular */ 36744e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 36754e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 36764e2b4712SSatish Balay vi = aj + diag[i] + 1; 36774e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 36784e2b4712SSatish Balay idt = 4*i; 3679f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3680f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 36814e2b4712SSatish Balay while (nz--) { 36824e2b4712SSatish Balay idx = 4*(*vi++); 3683f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3684f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3685f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3686f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3687f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3688f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 36894e2b4712SSatish Balay v += 16; 36904e2b4712SSatish Balay } 36914e2b4712SSatish Balay idc = 4*(*c--); 36924e2b4712SSatish Balay v = aa + 16*diag[i]; 3693f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3694f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3695f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3696f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 36974e2b4712SSatish Balay } 36984e2b4712SSatish Balay 36994e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 37004e2b4712SSatish Balay 
ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 37013649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 37021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3703dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37044e2b4712SSatish Balay PetscFunctionReturn(0); 37054e2b4712SSatish Balay } 3706f26ec98cSKris Buschelman 37078f690400SShri Abhyankar #undef __FUNCT__ 37084dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 37094dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 371078bb4007SShri Abhyankar { 371178bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 371278bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 371378bb4007SShri Abhyankar PetscErrorCode ierr; 3714b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3715b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 371678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 371778bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 371878bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 371978bb4007SShri Abhyankar const PetscScalar *b; 372078bb4007SShri Abhyankar 372178bb4007SShri Abhyankar PetscFunctionBegin; 37223649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 372378bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 372478bb4007SShri Abhyankar t = a->solve_work; 372578bb4007SShri Abhyankar 372678bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 372778bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 372878bb4007SShri Abhyankar 372978bb4007SShri Abhyankar /* forward solve the lower triangular */ 373078bb4007SShri Abhyankar idx = 4*r[0]; 373178bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 373278bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 373378bb4007SShri Abhyankar for (i=1; i<n; i++) { 373478bb4007SShri Abhyankar v = aa + 
16*ai[i]; 373578bb4007SShri Abhyankar vi = aj + ai[i]; 373678bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 373778bb4007SShri Abhyankar idx = 4*r[i]; 373878bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 373978bb4007SShri Abhyankar for (m=0;m<nz;m++) { 374078bb4007SShri Abhyankar idx = 4*vi[m]; 374178bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 374278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 374378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 374478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 374578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 374678bb4007SShri Abhyankar v += 16; 374778bb4007SShri Abhyankar } 374878bb4007SShri Abhyankar idx = 4*i; 374978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 375078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 375178bb4007SShri Abhyankar } 375278bb4007SShri Abhyankar /* backward solve the upper triangular */ 375378bb4007SShri Abhyankar for (i=n-1; i>=0; i--) { 375478bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 375578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 375678bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 375778bb4007SShri Abhyankar idt = 4*i; 375878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 375978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 376078bb4007SShri Abhyankar for (m=0;m<nz;m++) { 376178bb4007SShri Abhyankar idx = 4*vi[m]; 376278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 376378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 376478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 376578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 376678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 376778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 376878bb4007SShri Abhyankar v += 16; 376978bb4007SShri Abhyankar } 377078bb4007SShri Abhyankar idc = 
4*c[i]; 377178bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 377278bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 377378bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 377478bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 377578bb4007SShri Abhyankar } 377678bb4007SShri Abhyankar 377778bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 377878bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 37793649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 378078bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 378178bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 378278bb4007SShri Abhyankar PetscFunctionReturn(0); 378378bb4007SShri Abhyankar } 378478bb4007SShri Abhyankar 378578bb4007SShri Abhyankar #undef __FUNCT__ 3786f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3787dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3788f26ec98cSKris Buschelman { 3789f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3790f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 37916849ba73SBarry Smith PetscErrorCode ierr; 3792b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3793b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 37945d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3795d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3796d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3797d9fead3dSBarry Smith PetscScalar *x; 3798d9fead3dSBarry Smith const PetscScalar *b; 3799f26ec98cSKris Buschelman 3800f26ec98cSKris Buschelman PetscFunctionBegin; 38013649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 38021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3803f26ec98cSKris Buschelman t = (MatScalar 
*)a->solve_work; 3804f26ec98cSKris Buschelman 3805f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3806f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3807f26ec98cSKris Buschelman 3808f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3809f26ec98cSKris Buschelman idx = 4*(*r++); 3810f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3811f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3812f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3813f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3814f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3815f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3816f26ec98cSKris Buschelman vi = aj + ai[i]; 3817f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3818f26ec98cSKris Buschelman idx = 4*(*r++); 3819f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3820f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3821f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3822f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3823f26ec98cSKris Buschelman while (nz--) { 3824f26ec98cSKris Buschelman idx = 4*(*vi++); 3825f26ec98cSKris Buschelman x1 = t[idx]; 3826f26ec98cSKris Buschelman x2 = t[1+idx]; 3827f26ec98cSKris Buschelman x3 = t[2+idx]; 3828f26ec98cSKris Buschelman x4 = t[3+idx]; 3829f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3830f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3831f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3832f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3833f26ec98cSKris Buschelman v += 16; 3834f26ec98cSKris Buschelman } 3835f26ec98cSKris Buschelman idx = 4*i; 3836f26ec98cSKris Buschelman t[idx] = s1; 3837f26ec98cSKris Buschelman t[1+idx] = s2; 3838f26ec98cSKris Buschelman t[2+idx] = s3; 3839f26ec98cSKris Buschelman t[3+idx] = s4; 3840f26ec98cSKris Buschelman } 3841f26ec98cSKris Buschelman /* backward solve the upper 
triangular */ 3842f26ec98cSKris Buschelman for (i=n-1; i>=0; i--) { 3843f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3844f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3845f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3846f26ec98cSKris Buschelman idt = 4*i; 3847f26ec98cSKris Buschelman s1 = t[idt]; 3848f26ec98cSKris Buschelman s2 = t[1+idt]; 3849f26ec98cSKris Buschelman s3 = t[2+idt]; 3850f26ec98cSKris Buschelman s4 = t[3+idt]; 3851f26ec98cSKris Buschelman while (nz--) { 3852f26ec98cSKris Buschelman idx = 4*(*vi++); 3853f26ec98cSKris Buschelman x1 = t[idx]; 3854f26ec98cSKris Buschelman x2 = t[1+idx]; 3855f26ec98cSKris Buschelman x3 = t[2+idx]; 3856f26ec98cSKris Buschelman x4 = t[3+idx]; 3857f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3858f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3859f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3860f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3861f26ec98cSKris Buschelman v += 16; 3862f26ec98cSKris Buschelman } 3863f26ec98cSKris Buschelman idc = 4*(*c--); 3864f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3865f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3866f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3867f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3868f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3869f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3870f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3871f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3872f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3873f26ec98cSKris Buschelman } 3874f26ec98cSKris Buschelman 3875f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3876f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 38773649974fSBarry Smith ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 38781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3879dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3880f26ec98cSKris Buschelman PetscFunctionReturn(0); 3881f26ec98cSKris Buschelman } 3882f26ec98cSKris Buschelman 388324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 388424c233c2SKris Buschelman 388524c233c2SKris Buschelman #include PETSC_HAVE_SSE 388624c233c2SKris Buschelman 388724c233c2SKris Buschelman #undef __FUNCT__ 388824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3889dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 389024c233c2SKris Buschelman { 389124c233c2SKris Buschelman /* 389224c233c2SKris Buschelman Note: This code uses demotion of double 389324c233c2SKris Buschelman to float when performing the mixed-mode computation. 389424c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
389524c233c2SKris Buschelman */ 389624c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 389724c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 38986849ba73SBarry Smith PetscErrorCode ierr; 38995d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 39005d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 390124c233c2SKris Buschelman MatScalar *aa=a->a,*v; 390287828ca2SBarry Smith PetscScalar *x,*b,*t; 390324c233c2SKris Buschelman 390424c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 390524c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 390624c233c2SKris Buschelman unsigned long offset; 390724c233c2SKris Buschelman 390824c233c2SKris Buschelman PetscFunctionBegin; 390924c233c2SKris Buschelman SSE_SCOPE_BEGIN; 391024c233c2SKris Buschelman 391124c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 391224c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 391324c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 391424c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 391524c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 391624c233c2SKris Buschelman 39171ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39181ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 391924c233c2SKris Buschelman t = a->solve_work; 392024c233c2SKris Buschelman 392124c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 392224c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 392324c233c2SKris Buschelman 392424c233c2SKris Buschelman /* forward solve the lower triangular */ 392524c233c2SKris Buschelman idx = 4*(*r++); 392624c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 392724c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 392824c233c2SKris Buschelman v = aa + 16*ai[1]; 392924c233c2SKris Buschelman 393024c233c2SKris Buschelman for (i=1; i<n;) { 
393124c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 393224c233c2SKris Buschelman vi = aj + ai[i]; 393324c233c2SKris Buschelman nz = diag[i] - ai[i]; 393424c233c2SKris Buschelman idx = 4*(*r++); 393524c233c2SKris Buschelman 393624c233c2SKris Buschelman /* Demote sum from double to float */ 393724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 393824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 393924c233c2SKris Buschelman 394024c233c2SKris Buschelman while (nz--) { 394124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 394224c233c2SKris Buschelman idx = 4*(*vi++); 394324c233c2SKris Buschelman 394424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 394524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 394624c233c2SKris Buschelman 394724c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 394824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 394924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 395024c233c2SKris Buschelman 395124c233c2SKris Buschelman /* First Column */ 395224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 395324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 395424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 395524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 395624c233c2SKris Buschelman 395724c233c2SKris Buschelman /* Second Column */ 395824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 395924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 396024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 396124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 396224c233c2SKris Buschelman 396324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 396424c233c2SKris Buschelman 396524c233c2SKris Buschelman /* Third Column */ 396624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 396724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 396824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 396924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
397024c233c2SKris Buschelman 397124c233c2SKris Buschelman /* Fourth Column */ 397224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 397324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 397424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 397524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 397624c233c2SKris Buschelman SSE_INLINE_END_2 397724c233c2SKris Buschelman 397824c233c2SKris Buschelman v += 16; 397924c233c2SKris Buschelman } 398024c233c2SKris Buschelman idx = 4*i; 398124c233c2SKris Buschelman v = aa + 16*ai[++i]; 398224c233c2SKris Buschelman PREFETCH_NTA(v); 398324c233c2SKris Buschelman STORE_PS(tmps,XMM7); 398424c233c2SKris Buschelman 398524c233c2SKris Buschelman /* Promote result from float to double */ 398624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 398724c233c2SKris Buschelman } 398824c233c2SKris Buschelman /* backward solve the upper triangular */ 398924c233c2SKris Buschelman idt = 4*(n-1); 399024c233c2SKris Buschelman ai16 = 16*diag[n-1]; 399124c233c2SKris Buschelman v = aa + ai16 + 16; 399224c233c2SKris Buschelman for (i=n-1; i>=0;) { 399324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 399424c233c2SKris Buschelman vi = aj + diag[i] + 1; 399524c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 399624c233c2SKris Buschelman 399724c233c2SKris Buschelman /* Demote accumulator from double to float */ 399824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 399924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 400024c233c2SKris Buschelman 400124c233c2SKris Buschelman while (nz--) { 400224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 400324c233c2SKris Buschelman idx = 4*(*vi++); 400424c233c2SKris Buschelman 400524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 400624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 400724c233c2SKris Buschelman 400824c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 400924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
401024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 401124c233c2SKris Buschelman 401224c233c2SKris Buschelman /* First Column */ 401324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 401424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 401524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 401624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 401724c233c2SKris Buschelman 401824c233c2SKris Buschelman /* Second Column */ 401924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 402024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 402124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 402224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 402324c233c2SKris Buschelman 402424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 402524c233c2SKris Buschelman 402624c233c2SKris Buschelman /* Third Column */ 402724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 402824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 402924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 403024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 403124c233c2SKris Buschelman 403224c233c2SKris Buschelman /* Fourth Column */ 403324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 403424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 403524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 403624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 403724c233c2SKris Buschelman SSE_INLINE_END_2 403824c233c2SKris Buschelman v += 16; 403924c233c2SKris Buschelman } 404024c233c2SKris Buschelman v = aa + ai16; 404124c233c2SKris Buschelman ai16 = 16*diag[--i]; 404224c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 404324c233c2SKris Buschelman /* 404424c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 404524c233c2SKris Buschelman which was inverted as part of the factorization 404624c233c2SKris Buschelman */ 404724c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 404824c233c2SKris Buschelman /* First Column */ 404924c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 405024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 405124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 405224c233c2SKris Buschelman 405324c233c2SKris Buschelman /* Second Column */ 405424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 405524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 405624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 405724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 405824c233c2SKris Buschelman 405924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 406024c233c2SKris Buschelman 406124c233c2SKris Buschelman /* Third Column */ 406224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 406324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 406424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 406524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 406624c233c2SKris Buschelman 406724c233c2SKris Buschelman /* Fourth Column */ 406824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 406924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 407024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 407124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 407224c233c2SKris Buschelman 407324c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 407424c233c2SKris Buschelman SSE_INLINE_END_3 407524c233c2SKris Buschelman 407624c233c2SKris Buschelman /* Promote solution from float to double */ 407724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 407824c233c2SKris Buschelman 407924c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 408024c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 408124c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 408224c233c2SKris Buschelman idc = 4*(*c--); 408324c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 408424c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 408524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 408624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 408724c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 408824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 408924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 409024c233c2SKris Buschelman SSE_INLINE_END_2 409124c233c2SKris Buschelman v = aa + ai16 + 16; 409224c233c2SKris Buschelman idt -= 4; 409324c233c2SKris Buschelman } 409424c233c2SKris Buschelman 409524c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 409624c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40971ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 40981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4099dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 410024c233c2SKris Buschelman SSE_SCOPE_END; 410124c233c2SKris Buschelman PetscFunctionReturn(0); 410224c233c2SKris Buschelman } 410324c233c2SKris Buschelman 410424c233c2SKris Buschelman #endif 41050ef38995SBarry Smith 41060ef38995SBarry Smith 41074e2b4712SSatish Balay /* 41084e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 41094e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
41104e2b4712SSatish Balay */ 41114a2ae208SSatish Balay #undef __FUNCT__ 411206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 411306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 41144e2b4712SSatish Balay { 41154e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4116356650c2SBarry Smith PetscInt n=a->mbs; 4117356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 4118dfbe8321SBarry Smith PetscErrorCode ierr; 4119356650c2SBarry Smith const PetscInt *diag = a->diag; 4120d9fead3dSBarry Smith const MatScalar *aa=a->a; 4121d9fead3dSBarry Smith PetscScalar *x; 4122d9fead3dSBarry Smith const PetscScalar *b; 41234e2b4712SSatish Balay 41244e2b4712SSatish Balay PetscFunctionBegin; 41253649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 41261ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41274e2b4712SSatish Balay 4128aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 41292853dc0eSBarry Smith { 413087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41312853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 41322853dc0eSBarry Smith } 4133aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 41342853dc0eSBarry Smith { 413587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41362853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 41372853dc0eSBarry Smith } 4138aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 41392853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4140e1293385SBarry Smith #else 414130d4dcafSBarry Smith { 414287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4143d9fead3dSBarry Smith const MatScalar *v; 4144356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 4145356650c2SBarry Smith const PetscInt *vi; 4146e1293385SBarry Smith 41474e2b4712SSatish Balay /* forward solve the lower 
triangular */ 41484e2b4712SSatish Balay idx = 0; 4149e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 41504e2b4712SSatish Balay for (i=1; i<n; i++) { 41514e2b4712SSatish Balay v = aa + 16*ai[i]; 41524e2b4712SSatish Balay vi = aj + ai[i]; 41534e2b4712SSatish Balay nz = diag[i] - ai[i]; 4154e1293385SBarry Smith idx += 4; 4155f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 41564e2b4712SSatish Balay while (nz--) { 41574e2b4712SSatish Balay jdx = 4*(*vi++); 41584e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4159f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4160f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4161f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4162f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 41634e2b4712SSatish Balay v += 16; 41644e2b4712SSatish Balay } 4165f1af5d2fSBarry Smith x[idx] = s1; 4166f1af5d2fSBarry Smith x[1+idx] = s2; 4167f1af5d2fSBarry Smith x[2+idx] = s3; 4168f1af5d2fSBarry Smith x[3+idx] = s4; 41694e2b4712SSatish Balay } 41704e2b4712SSatish Balay /* backward solve the upper triangular */ 41714e555682SBarry Smith idt = 4*(n-1); 41724e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 41734e555682SBarry Smith ai16 = 16*diag[i]; 41744e555682SBarry Smith v = aa + ai16 + 16; 41754e2b4712SSatish Balay vi = aj + diag[i] + 1; 41764e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4177f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4178f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 41794e2b4712SSatish Balay while (nz--) { 41804e2b4712SSatish Balay idx = 4*(*vi++); 41814e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4182f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4183f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4184f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4185f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 41864e2b4712SSatish Balay v += 16; 41874e2b4712SSatish Balay } 41884e555682SBarry Smith v = aa + ai16; 4189f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4190f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4191f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4192f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4193329f5518SBarry Smith idt -= 4; 41944e2b4712SSatish Balay } 419530d4dcafSBarry Smith } 4196e1293385SBarry Smith #endif 41974e2b4712SSatish Balay 41983649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 41991ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4200dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 42014e2b4712SSatish Balay PetscFunctionReturn(0); 42024e2b4712SSatish Balay } 42034e2b4712SSatish Balay 4204b2b2dd24SShri Abhyankar #undef __FUNCT__ 42054dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 42064dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4207b2b2dd24SShri Abhyankar { 4208b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4209b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4210b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4211b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4212b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4213b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4214b2b2dd24SShri Abhyankar PetscScalar *x; 4215b2b2dd24SShri Abhyankar const PetscScalar *b; 4216b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4217cee9d6f2SShri Abhyankar 4218b2b2dd24SShri Abhyankar PetscFunctionBegin; 42193649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4220b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4221b2b2dd24SShri Abhyankar /* 
forward solve the lower triangular */ 4222b2b2dd24SShri Abhyankar idx = 0; 4223b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4224b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4225b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4226b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4227b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4228b2b2dd24SShri Abhyankar idx = bs*i; 4229b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4230b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 4231b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4232b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4233b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4234b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4235b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4236b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4237b2b2dd24SShri Abhyankar 4238b2b2dd24SShri Abhyankar v += bs2; 4239b2b2dd24SShri Abhyankar } 4240b2b2dd24SShri Abhyankar 4241b2b2dd24SShri Abhyankar x[idx] = s1; 4242b2b2dd24SShri Abhyankar x[1+idx] = s2; 4243b2b2dd24SShri Abhyankar x[2+idx] = s3; 4244b2b2dd24SShri Abhyankar x[3+idx] = s4; 4245b2b2dd24SShri Abhyankar } 4246b2b2dd24SShri Abhyankar 4247b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4248b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--) { 4249b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4250b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4251b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4252b2b2dd24SShri Abhyankar idt = bs*i; 4253b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4254b2b2dd24SShri Abhyankar 4255b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 4256b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4257b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4258b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + 
v[12]*x4; 4259b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4260b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4261b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4262b2b2dd24SShri Abhyankar 4263b2b2dd24SShri Abhyankar v += bs2; 4264b2b2dd24SShri Abhyankar } 4265b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4266b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4267b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4268b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4269b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4270b2b2dd24SShri Abhyankar 4271b2b2dd24SShri Abhyankar } 4272b2b2dd24SShri Abhyankar 42733649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4274b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4275b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4276b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4277b2b2dd24SShri Abhyankar } 4278cee9d6f2SShri Abhyankar 4279cee9d6f2SShri Abhyankar #undef __FUNCT__ 4280f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4281dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4282f26ec98cSKris Buschelman { 4283f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4284b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4285dfbe8321SBarry Smith PetscErrorCode ierr; 4286b3260449SShri Abhyankar const MatScalar *aa=a->a; 4287b3260449SShri Abhyankar const PetscScalar *b; 4288b3260449SShri Abhyankar PetscScalar *x; 4289f26ec98cSKris Buschelman 4290f26ec98cSKris Buschelman PetscFunctionBegin; 42913649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 42921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4293f26ec98cSKris 
Buschelman 4294f26ec98cSKris Buschelman { 4295f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4296b3260449SShri Abhyankar const MatScalar *v; 4297b3260449SShri Abhyankar MatScalar *t=(MatScalar *)x; 4298b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i,ai16; 4299b3260449SShri Abhyankar const PetscInt *vi; 4300f26ec98cSKris Buschelman 4301f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4302f26ec98cSKris Buschelman idx = 0; 4303f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4304f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4305f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4306f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4307f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4308f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4309f26ec98cSKris Buschelman vi = aj + ai[i]; 4310f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4311f26ec98cSKris Buschelman idx += 4; 4312f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4313f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4314f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4315f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4316f26ec98cSKris Buschelman while (nz--) { 4317f26ec98cSKris Buschelman jdx = 4*(*vi++); 4318f26ec98cSKris Buschelman x1 = t[jdx]; 4319f26ec98cSKris Buschelman x2 = t[1+jdx]; 4320f26ec98cSKris Buschelman x3 = t[2+jdx]; 4321f26ec98cSKris Buschelman x4 = t[3+jdx]; 4322f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4323f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4324f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4325f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4326f26ec98cSKris Buschelman v += 16; 4327f26ec98cSKris Buschelman } 4328f26ec98cSKris Buschelman t[idx] = s1; 4329f26ec98cSKris Buschelman t[1+idx] = s2; 4330f26ec98cSKris Buschelman t[2+idx] = s3; 4331f26ec98cSKris Buschelman t[3+idx] = s4; 4332f26ec98cSKris Buschelman } 4333f26ec98cSKris Buschelman /* 
backward solve the upper triangular */ 4334f26ec98cSKris Buschelman idt = 4*(n-1); 4335f26ec98cSKris Buschelman for (i=n-1; i>=0; i--) { 4336f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4337f26ec98cSKris Buschelman v = aa + ai16 + 16; 4338f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4339f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4340f26ec98cSKris Buschelman s1 = t[idt]; 4341f26ec98cSKris Buschelman s2 = t[1+idt]; 4342f26ec98cSKris Buschelman s3 = t[2+idt]; 4343f26ec98cSKris Buschelman s4 = t[3+idt]; 4344f26ec98cSKris Buschelman while (nz--) { 4345f26ec98cSKris Buschelman idx = 4*(*vi++); 4346f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4347f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4348f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4349f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4350f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4351f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4352f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4353f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4354f26ec98cSKris Buschelman v += 16; 4355f26ec98cSKris Buschelman } 4356f26ec98cSKris Buschelman v = aa + ai16; 4357f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4358f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4359f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4360f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4361f26ec98cSKris Buschelman idt -= 4; 4362f26ec98cSKris Buschelman } 4363f26ec98cSKris Buschelman } 4364f26ec98cSKris Buschelman 43653649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 43661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4367dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 
4368f26ec98cSKris Buschelman PetscFunctionReturn(0); 4369f26ec98cSKris Buschelman } 4370f26ec98cSKris Buschelman 43713660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 43723660e330SKris Buschelman 43733660e330SKris Buschelman #include PETSC_HAVE_SSE 43743660e330SKris Buschelman #undef __FUNCT__ 43757cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4376dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 43773660e330SKris Buschelman { 43783660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 43792aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4380dfbe8321SBarry Smith PetscErrorCode ierr; 4381dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 43823660e330SKris Buschelman MatScalar *aa=a->a; 438387828ca2SBarry Smith PetscScalar *x,*b; 43843660e330SKris Buschelman 43853660e330SKris Buschelman PetscFunctionBegin; 43863660e330SKris Buschelman SSE_SCOPE_BEGIN; 43873660e330SKris Buschelman /* 43883660e330SKris Buschelman Note: This code currently uses demotion of double 43893660e330SKris Buschelman to float when performing the mixed-mode computation. 43903660e330SKris Buschelman This may not be numerically reasonable for all applications. 43913660e330SKris Buschelman */ 43923660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 43933660e330SKris Buschelman 43941ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 43951ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 43963660e330SKris Buschelman { 4397eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4398eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 43992aa5897fSKris Buschelman int nz,i,idt,ai16; 44002aa5897fSKris Buschelman unsigned int jdx,idx; 44012aa5897fSKris Buschelman unsigned short *vi; 4402eb05f457SKris Buschelman /* Forward solve the lower triangular factor. 
*/ 44033660e330SKris Buschelman 4404eb05f457SKris Buschelman /* First block is the identity. */ 44053660e330SKris Buschelman idx = 0; 4406eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 44072aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 44083660e330SKris Buschelman 44093660e330SKris Buschelman for (i=1; i<n;) { 44103660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44113660e330SKris Buschelman vi = aj + ai[i]; 44123660e330SKris Buschelman nz = diag[i] - ai[i]; 44133660e330SKris Buschelman idx += 4; 44143660e330SKris Buschelman 4415eb05f457SKris Buschelman /* Demote RHS from double to float. */ 4416eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4417eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 44183660e330SKris Buschelman 44193660e330SKris Buschelman while (nz--) { 44203660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44212aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 44223660e330SKris Buschelman 44233660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4424eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 44253660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44263660e330SKris Buschelman 44273660e330SKris Buschelman /* First Column */ 44283660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44293660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44303660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44313660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44323660e330SKris Buschelman 44333660e330SKris Buschelman /* Second Column */ 44343660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44353660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44363660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44373660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44383660e330SKris Buschelman 44393660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44403660e330SKris Buschelman 44413660e330SKris Buschelman /* Third Column */ 44423660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 
44433660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44443660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44453660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44463660e330SKris Buschelman 44473660e330SKris Buschelman /* Fourth Column */ 44483660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44493660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44503660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44513660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44523660e330SKris Buschelman SSE_INLINE_END_2 44533660e330SKris Buschelman 44543660e330SKris Buschelman v += 16; 44553660e330SKris Buschelman } 44563660e330SKris Buschelman v = aa + 16*ai[++i]; 44573660e330SKris Buschelman PREFETCH_NTA(v); 4458eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 44593660e330SKris Buschelman } 4460eb05f457SKris Buschelman 4461eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4462eb05f457SKris Buschelman 44633660e330SKris Buschelman idt = 4*(n-1); 44643660e330SKris Buschelman ai16 = 16*diag[n-1]; 44653660e330SKris Buschelman v = aa + ai16 + 16; 44663660e330SKris Buschelman for (i=n-1; i>=0;) { 44673660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44683660e330SKris Buschelman vi = aj + diag[i] + 1; 44693660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 44703660e330SKris Buschelman 4471eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 44723660e330SKris Buschelman 44733660e330SKris Buschelman while (nz--) { 44743660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44752aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 44763660e330SKris Buschelman 44773660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4478eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 44793660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44803660e330SKris Buschelman 44813660e330SKris Buschelman /* First Column */ 44823660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44833660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 
44843660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44863660e330SKris Buschelman 44873660e330SKris Buschelman /* Second Column */ 44883660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44893660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44903660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44913660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44923660e330SKris Buschelman 44933660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44943660e330SKris Buschelman 44953660e330SKris Buschelman /* Third Column */ 44963660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 44973660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44983660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44993660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 45003660e330SKris Buschelman 45013660e330SKris Buschelman /* Fourth Column */ 45023660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 45033660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45043660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45053660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45063660e330SKris Buschelman SSE_INLINE_END_2 45073660e330SKris Buschelman v += 16; 45083660e330SKris Buschelman } 45093660e330SKris Buschelman v = aa + ai16; 45103660e330SKris Buschelman ai16 = 16*diag[--i]; 45113660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 45123660e330SKris Buschelman /* 45133660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 45143660e330SKris Buschelman which was inverted as part of the factorization 45153660e330SKris Buschelman */ 4516eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 45173660e330SKris Buschelman /* First Column */ 45183660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 45193660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45203660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 45213660e330SKris Buschelman 45223660e330SKris Buschelman /* Second Column */ 45233660e330SKris 
Buschelman SSE_COPY_PS(XMM1,XMM7) 45243660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45253660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 45263660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 45273660e330SKris Buschelman 45283660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 45293660e330SKris Buschelman 45303660e330SKris Buschelman /* Third Column */ 45313660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 45323660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45333660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 45343660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 45353660e330SKris Buschelman 45363660e330SKris Buschelman /* Fourth Column */ 45373660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 45383660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45393660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 45403660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 45413660e330SKris Buschelman 45423660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 45433660e330SKris Buschelman SSE_INLINE_END_3 45443660e330SKris Buschelman 45453660e330SKris Buschelman v = aa + ai16 + 16; 45463660e330SKris Buschelman idt -= 4; 45473660e330SKris Buschelman } 4548eb05f457SKris Buschelman 4549eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4550eb05f457SKris Buschelman idt = 4*(n-1); 4551eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4552eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4553eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4554eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4555eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4556eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4557eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4558eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4559eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 456054693613SKris Buschelman idt -= 4; 45613660e330SKris Buschelman } 4562eb05f457SKris Buschelman 4563eb05f457SKris Buschelman } /* End of artificial scope. */ 45641ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45651ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4566dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 45673660e330SKris Buschelman SSE_SCOPE_END; 45683660e330SKris Buschelman PetscFunctionReturn(0); 45693660e330SKris Buschelman } 45703660e330SKris Buschelman

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - Solves A x = b with an
   in-place factored BAIJ matrix of 4x4 blocks (16 consecutive entries of a->a
   per block) in the natural ordering, using the SSE macro layer.

   Input:
     A  - factored matrix; a->i / a->diag index the block rows and their diagonals
     bb - right-hand side
   Output:
     xx - solution

   Note: This code currently uses demotion of double to float when performing
   the mixed-mode computation.  This may not be numerically reasonable for all
   applications.  The solution is built in single precision inside the storage
   of xx and promoted to PetscScalar in place at the end.

   NOTE(review): a->i, a->j and a->diag are accessed through plain int pointers
   and bb is obtained with VecGetArray() although it is only read; both are
   kept as in the original because the SSE conversion macros are not visible
   here -- confirm before switching to PetscInt / VecGetArrayRead().
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  int            *aj=a->j;
  PetscErrorCode ierr;
  int            *ai=a->i,n=a->mbs,*diag = a->diag;
  MatScalar      *aa=a->a;
  PetscScalar    *x,*b;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;
  /* ai[1] only exists when there is at least one block row */
  if (n > 0) {
    PREFETCH_NTA(aa+16*ai[1]);
  }

  ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  if (n > 0) {
    /* x will first be computed in single precision then promoted inplace to double */
    MatScalar *v,*t=(MatScalar *)x;
    int       nz,i,idt,ai16;
    int       jdx,idx;
    int       *vi;

    /* Forward solve the lower triangular factor. */

    /* First block is the identity. */
    idx = 0;
    CONVERT_DOUBLE4_FLOAT4(t,b);
    v   = aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   = aj      + ai[i];
      nz   = diag[i] - ai[i];   /* blocks strictly left of the diagonal */
      idx += 4;

      /* Demote RHS from double to float. */
      CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      LOAD_PS(&t[idx],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        jdx = 4*(*vi++);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[jdx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v += 16;
      }
      v = aa + 16*ai[++i];   /* ai[n] is a valid entry, so this is safe on the last pass */
      PREFETCH_NTA(v);
      STORE_PS(&t[idx],XMM7);
    }

    /* Backward solve the upper triangular factor.*/

    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;) {
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;   /* blocks strictly right of the diagonal */

      LOAD_PS(&t[idt],XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(&t[idx],v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v += 16;
      }
      v = aa + ai16;   /* diagonal block of the current row */
      /*
         Advance to the previous row.  diag[] is only read while a previous
         row exists; on the final pass ai16 keeps its value, so the prefetch
         hint below targets the current (valid) block instead of diag[-1].
      */
      --i;
      if (i >= 0) {
        ai16 = 16*diag[i];
        PREFETCH_NTA(aa+ai16+16);
      }
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      v    = aa + ai16 + 16;
      idt -= 4;
    }

    /* Convert t from single precision back to double precision (inplace)*/
    idt = 4*(n-1);
    for (i=n-1;i>=0;i--) {
      /*
         t aliases the storage of x, so each group of four is promoted from
         its highest entry down; CONVERT_FLOAT4_DOUBLE4 counts from 0 to 3,
         which doesn't work here.
      */
      PetscScalar *xtemp=&x[idt];
      MatScalar   *ttemp=&t[idt];
      xtemp[3] = (PetscScalar)ttemp[3];
      xtemp[2] = (PetscScalar)ttemp[2];
      xtemp[1] = (PetscScalar)ttemp[1];
      xtemp[0] = (PetscScalar)ttemp[0];
      idt -= 4;
    }

  } /* End of artificial scope. */
  ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
/*
   MatSolve_SeqBAIJ_3_inplace - Solves A x = b with an in-place factored BAIJ
   matrix of 3x3 blocks, applying the row permutation a->row to b and the
   column permutation a->col to x.

   Each block is 9 consecutive entries of a->a, indexed column by column
   (row 0 of a block uses v[0], v[3], v[6]).  For block row i the blocks
   ai[i] .. diag[i]-1 belong to L and diag[i]+1 .. ai[i+1]-1 to U.  The block
   at diag[i] is multiplied into the result, so it is presumably stored
   already inverted -- confirm against the factorization routine.

   a->solve_work holds the intermediate vector in factor ordering.
*/
PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
  PetscInt          i,k,nz,idx,idt,idc;
  const PetscInt    *r,*c;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  if (n > 0) {
    /* block row 0 of L has no off-diagonal blocks: just gather the permuted rhs */
    idx  = 3*r[0];
    t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
  }
  for (i=1; i<n; i++) {
    v   = aa + 9*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 3*r[i];
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
    for (k=0; k<nz; k++) {
      idx = 3*vi[k];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    idx    = 3*i;
    t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 9*diag[i] + 9;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 3*i;
    s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
    for (k=0; k<nz; k++) {
      idx = 3*vi[k];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    /* indexed access: never forms a pointer in front of the permutation array */
    idc      = 3*c[i];
    v        = aa + 9*diag[i];
    x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
    x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
    x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3"
/*
   MatSolve_SeqBAIJ_3 - Solves A x = b with a factored BAIJ matrix of 3x3
   blocks, applying the row permutation a->row to b and the column
   permutation a->col to x.

   Storage as read by this routine: block row i of L occupies
   ai[i] .. ai[i+1]-1; the off-diagonal blocks of row i of U occupy
   adiag[i+1]+1 .. adiag[i]-1 and are followed directly by the block at
   adiag[i], which is multiplied into the result (presumably the inverted
   diagonal block -- confirm against the factorization routine).
*/
PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscInt          i,nz,idx,idt,idc,m;
  const PetscInt    *r,*c;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&c);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  if (n > 0) {
    idx  = 3*r[0];
    t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
  }
  for (i=1; i<n; i++) {
    v   = aa + 9*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 3*r[i];
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
    for (m=0; m<nz; m++) {
      idx = 3*vi[m];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    idx    = 3*i;
    t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 9*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = 3*i;
    s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
    for (m=0; m<nz; m++) {
      idx = 3*vi[m];
      x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    /* after the loop v points at the block stored at adiag[i] */
    idc      = 3*c[i];
    x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
    x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
    x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
  }
  ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&c);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
      Special case where the matrix was ILU(0) factored in the natural
   ordering. This eliminates the need for the column and row permutation.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering_inplace - Solves A x = b with an
   in-place factored BAIJ matrix of 3x3 blocks in the natural ordering.  No
   permutation and no work vector are used: the forward and backward sweeps
   both operate directly in the array of xx.
*/
PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  const PetscInt    *diag = a->diag,*vi;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,x1,x2,x3;
  const PetscScalar *b;
  PetscInt          jdx,idt,idx,nz,i,k;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  if (n > 0) {
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
  }
  for (i=1; i<n; i++) {
    v   = aa + 9*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 3*i;
    s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
    for (k=0; k<nz; k++) {
      jdx = 3*vi[k];
      x1  = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 9*diag[i] + 9;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 3*i;
    s1  = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];
    for (k=0; k<nz; k++) {
      idx = 3*vi[k];
      x1  = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];
      s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
      s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
      s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
      v  += 9;
    }
    /* multiply by the block stored on the diagonal */
    v        = aa + 9*diag[i];
    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
49854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4986b2b2dd24SShri Abhyankar { 4987b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4988b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4989b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4990b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4991b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4992b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4993b2b2dd24SShri Abhyankar PetscScalar *x; 4994b2b2dd24SShri Abhyankar const PetscScalar *b; 4995b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4996b2b2dd24SShri Abhyankar 4997b2b2dd24SShri Abhyankar PetscFunctionBegin; 49983649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4999b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5000b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5001b2b2dd24SShri Abhyankar idx = 0; 5002b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5003b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5004b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 5005b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5006b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5007b2b2dd24SShri Abhyankar idx = bs*i; 5008b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5009b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 5010b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 5011b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5012b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5013b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5014b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5015b2b2dd24SShri Abhyankar 5016b2b2dd24SShri Abhyankar v += bs2; 5017b2b2dd24SShri Abhyankar } 5018b2b2dd24SShri Abhyankar 5019b2b2dd24SShri Abhyankar x[idx] = s1; 5020b2b2dd24SShri Abhyankar x[1+idx] = s2; 5021b2b2dd24SShri Abhyankar x[2+idx] = s3; 5022b2b2dd24SShri 
Abhyankar } 5023b2b2dd24SShri Abhyankar 5024b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5025b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--) { 5026b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 5027b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5028b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5029b2b2dd24SShri Abhyankar idt = bs*i; 5030b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5031b2b2dd24SShri Abhyankar 5032b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 5033b2b2dd24SShri Abhyankar idx = bs*vi[k]; 5034b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5035b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5036b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5037b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5038b2b2dd24SShri Abhyankar 5039b2b2dd24SShri Abhyankar v += bs2; 5040b2b2dd24SShri Abhyankar } 5041b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5042b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5043b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5044b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5045b2b2dd24SShri Abhyankar 5046b2b2dd24SShri Abhyankar } 5047b2b2dd24SShri Abhyankar 50483649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5049b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5050b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5051b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5052b2b2dd24SShri Abhyankar } 5053b2b2dd24SShri Abhyankar 5054b2b2dd24SShri Abhyankar #undef __FUNCT__ 505506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 505606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 50574e2b4712SSatish Balay { 50584e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 50594e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 50606849ba73SBarry Smith 
PetscErrorCode ierr; 5061b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5062b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 50635d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5064d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5065d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 5066d9fead3dSBarry Smith const PetscScalar *b; 50674e2b4712SSatish Balay 50684e2b4712SSatish Balay PetscFunctionBegin; 50693649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 50701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5071f1af5d2fSBarry Smith t = a->solve_work; 50724e2b4712SSatish Balay 50734e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 50744e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 50754e2b4712SSatish Balay 50764e2b4712SSatish Balay /* forward solve the lower triangular */ 50774e2b4712SSatish Balay idx = 2*(*r++); 5078f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 50794e2b4712SSatish Balay for (i=1; i<n; i++) { 50804e2b4712SSatish Balay v = aa + 4*ai[i]; 50814e2b4712SSatish Balay vi = aj + ai[i]; 50824e2b4712SSatish Balay nz = diag[i] - ai[i]; 50834e2b4712SSatish Balay idx = 2*(*r++); 5084f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 50854e2b4712SSatish Balay while (nz--) { 50864e2b4712SSatish Balay idx = 2*(*vi++); 5087f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5088f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5089f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 50904e2b4712SSatish Balay v += 4; 50914e2b4712SSatish Balay } 50924e2b4712SSatish Balay idx = 2*i; 5093f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 50944e2b4712SSatish Balay } 50954e2b4712SSatish Balay /* backward solve the upper triangular */ 50964e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 50974e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 50984e2b4712SSatish Balay vi = aj + diag[i] + 1; 50994e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 
51004e2b4712SSatish Balay idt = 2*i; 5101f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 51024e2b4712SSatish Balay while (nz--) { 51034e2b4712SSatish Balay idx = 2*(*vi++); 5104f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5105f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5106f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51074e2b4712SSatish Balay v += 4; 51084e2b4712SSatish Balay } 51094e2b4712SSatish Balay idc = 2*(*c--); 51104e2b4712SSatish Balay v = aa + 4*diag[i]; 5111f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5112f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51134e2b4712SSatish Balay } 51144e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51154e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51163649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5118dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51194e2b4712SSatish Balay PetscFunctionReturn(0); 51204e2b4712SSatish Balay } 51214e2b4712SSatish Balay 51220c4413a7SShri Abhyankar #undef __FUNCT__ 51234dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 51244dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 51250c4413a7SShri Abhyankar { 51260c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 51270c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 51280c4413a7SShri Abhyankar PetscErrorCode ierr; 5129b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5130b3260449SShri Abhyankar PetscInt i,nz,idx,jdx,idt,idc,m; 51310c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 51320c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 51330c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 51340c4413a7SShri Abhyankar const PetscScalar *b; 51350c4413a7SShri Abhyankar 51360c4413a7SShri Abhyankar PetscFunctionBegin; 
51373649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51380c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 51390c4413a7SShri Abhyankar t = a->solve_work; 51400c4413a7SShri Abhyankar 51410c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51420c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 51430c4413a7SShri Abhyankar 51440c4413a7SShri Abhyankar /* forward solve the lower triangular */ 51450c4413a7SShri Abhyankar idx = 2*r[0]; 51460c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 51470c4413a7SShri Abhyankar for (i=1; i<n; i++) { 51480c4413a7SShri Abhyankar v = aa + 4*ai[i]; 51490c4413a7SShri Abhyankar vi = aj + ai[i]; 51500c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 51510c4413a7SShri Abhyankar idx = 2*r[i]; 51520c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 51530c4413a7SShri Abhyankar for (m=0;m<nz;m++) { 51540c4413a7SShri Abhyankar jdx = 2*vi[m]; 51550c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 51560c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51570c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51580c4413a7SShri Abhyankar v += 4; 51590c4413a7SShri Abhyankar } 51600c4413a7SShri Abhyankar idx = 2*i; 51610c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 51620c4413a7SShri Abhyankar } 51630c4413a7SShri Abhyankar /* backward solve the upper triangular */ 51640c4413a7SShri Abhyankar for (i=n-1; i>=0; i--) { 51650c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 51660c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 51670c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 51680c4413a7SShri Abhyankar idt = 2*i; 51690c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 51700c4413a7SShri Abhyankar for (m=0;m<nz;m++) { 51710c4413a7SShri Abhyankar idx = 2*vi[m]; 51720c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 51730c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51740c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51750c4413a7SShri Abhyankar v += 4; 
51760c4413a7SShri Abhyankar } 51770c4413a7SShri Abhyankar idc = 2*c[i]; 51780c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 51790c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51800c4413a7SShri Abhyankar } 51810c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51820c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51833649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51840c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51850c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51860c4413a7SShri Abhyankar PetscFunctionReturn(0); 51870c4413a7SShri Abhyankar } 51888f690400SShri Abhyankar 518915091d37SBarry Smith /* 519015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 519115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 519215091d37SBarry Smith */ 51934a2ae208SSatish Balay #undef __FUNCT__ 519406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 519506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 519615091d37SBarry Smith { 519715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5198b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5199dfbe8321SBarry Smith PetscErrorCode ierr; 5200d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5201d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5202d9fead3dSBarry Smith const PetscScalar *b; 5203b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 520415091d37SBarry Smith 520515091d37SBarry Smith PetscFunctionBegin; 52063649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 52071ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 520815091d37SBarry Smith 520915091d37SBarry Smith /* forward solve the lower triangular */ 521015091d37SBarry Smith idx = 0; 
521115091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 521215091d37SBarry Smith for (i=1; i<n; i++) { 521315091d37SBarry Smith v = aa + 4*ai[i]; 521415091d37SBarry Smith vi = aj + ai[i]; 521515091d37SBarry Smith nz = diag[i] - ai[i]; 521615091d37SBarry Smith idx += 2; 5217f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 521815091d37SBarry Smith while (nz--) { 521915091d37SBarry Smith jdx = 2*(*vi++); 522015091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5221f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5222f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 522315091d37SBarry Smith v += 4; 522415091d37SBarry Smith } 5225f1af5d2fSBarry Smith x[idx] = s1; 5226f1af5d2fSBarry Smith x[1+idx] = s2; 522715091d37SBarry Smith } 522815091d37SBarry Smith /* backward solve the upper triangular */ 522915091d37SBarry Smith for (i=n-1; i>=0; i--) { 523015091d37SBarry Smith v = aa + 4*diag[i] + 4; 523115091d37SBarry Smith vi = aj + diag[i] + 1; 523215091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 523315091d37SBarry Smith idt = 2*i; 5234f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 523515091d37SBarry Smith while (nz--) { 523615091d37SBarry Smith idx = 2*(*vi++); 523715091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5238f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5239f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 524015091d37SBarry Smith v += 4; 524115091d37SBarry Smith } 524215091d37SBarry Smith v = aa + 4*diag[i]; 5243f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5244f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 524515091d37SBarry Smith } 524615091d37SBarry Smith 52473649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5249dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 525015091d37SBarry Smith PetscFunctionReturn(0); 525115091d37SBarry Smith } 525215091d37SBarry Smith 5253cee9d6f2SShri Abhyankar #undef __FUNCT__ 52544dd39f65SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 52554dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5256b2b2dd24SShri Abhyankar { 5257b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5258b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5259b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 5260b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5261b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5262b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5263b2b2dd24SShri Abhyankar const PetscScalar *b; 5264b2b2dd24SShri Abhyankar 5265b2b2dd24SShri Abhyankar PetscFunctionBegin; 52663649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5267b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5268b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5269b2b2dd24SShri Abhyankar idx = 0; 5270b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5271b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5272b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5273b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5274b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5275b2b2dd24SShri Abhyankar idx = 2*i; 5276b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 52774c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 52784c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5279b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 5280b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5281b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5282b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5283b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5284b2b2dd24SShri Abhyankar v += 4; 5285b2b2dd24SShri Abhyankar } 5286b2b2dd24SShri Abhyankar x[idx] = s1; 5287b2b2dd24SShri Abhyankar x[1+idx] = s2; 5288b2b2dd24SShri Abhyankar } 5289b2b2dd24SShri Abhyankar 5290b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5291b2b2dd24SShri Abhyankar for 
(i=n-1; i>=0; i--) { 5292b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5293b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5294b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5295b2b2dd24SShri Abhyankar idt = 2*i; 5296b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 52974c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 52984c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5299b2b2dd24SShri Abhyankar for (k=0;k<nz;k++) { 5300b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5301b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5302b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5303b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5304b2b2dd24SShri Abhyankar v += 4; 5305b2b2dd24SShri Abhyankar } 5306b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5307b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5308b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5309b2b2dd24SShri Abhyankar } 5310b2b2dd24SShri Abhyankar 53113649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5312b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5313b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5314b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5315b2b2dd24SShri Abhyankar } 5316b2b2dd24SShri Abhyankar 5317b2b2dd24SShri Abhyankar #undef __FUNCT__ 531806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 531906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 53204e2b4712SSatish Balay { 53214e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 53224e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 53236849ba73SBarry Smith PetscErrorCode ierr; 5324b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5325b3260449SShri Abhyankar PetscInt i,nz; 53265d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5327b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 
5328b3260449SShri Abhyankar PetscScalar *x,s1,*t; 5329b3260449SShri Abhyankar const PetscScalar *b; 53304e2b4712SSatish Balay 53314e2b4712SSatish Balay PetscFunctionBegin; 53324e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 53334e2b4712SSatish Balay 53343649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 53351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5336f1af5d2fSBarry Smith t = a->solve_work; 53374e2b4712SSatish Balay 53384e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 53394e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 53404e2b4712SSatish Balay 53414e2b4712SSatish Balay /* forward solve the lower triangular */ 5342f1af5d2fSBarry Smith t[0] = b[*r++]; 53434e2b4712SSatish Balay for (i=1; i<n; i++) { 53444e2b4712SSatish Balay v = aa + ai[i]; 53454e2b4712SSatish Balay vi = aj + ai[i]; 53464e2b4712SSatish Balay nz = diag[i] - ai[i]; 5347f1af5d2fSBarry Smith s1 = b[*r++]; 53484e2b4712SSatish Balay while (nz--) { 5349f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53504e2b4712SSatish Balay } 5351f1af5d2fSBarry Smith t[i] = s1; 53524e2b4712SSatish Balay } 53534e2b4712SSatish Balay /* backward solve the upper triangular */ 53544e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 53554e2b4712SSatish Balay v = aa + diag[i] + 1; 53564e2b4712SSatish Balay vi = aj + diag[i] + 1; 53574e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5358f1af5d2fSBarry Smith s1 = t[i]; 53594e2b4712SSatish Balay while (nz--) { 5360f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53614e2b4712SSatish Balay } 5362f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 53634e2b4712SSatish Balay } 53644e2b4712SSatish Balay 53654e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 53664e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 53673649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 53681ebc52fbSHong Zhang ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5369dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53704e2b4712SSatish Balay PetscFunctionReturn(0); 53714e2b4712SSatish Balay } 5372048b5e81SShri Abhyankar 5373048b5e81SShri Abhyankar #undef __FUNCT__ 5374048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5375048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5376048b5e81SShri Abhyankar { 5377048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5378048b5e81SShri Abhyankar IS iscol = a->col,isrow = a->row; 5379048b5e81SShri Abhyankar PetscErrorCode ierr; 5380048b5e81SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5381048b5e81SShri Abhyankar const PetscInt *rout,*cout,*r,*c; 5382048b5e81SShri Abhyankar PetscScalar *x,*tmp,sum; 5383048b5e81SShri Abhyankar const PetscScalar *b; 5384048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5385048b5e81SShri Abhyankar 5386048b5e81SShri Abhyankar PetscFunctionBegin; 5387048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5388048b5e81SShri Abhyankar 53893649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5390048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5391048b5e81SShri Abhyankar tmp = a->solve_work; 5392048b5e81SShri Abhyankar 5393048b5e81SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5394048b5e81SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5395048b5e81SShri Abhyankar 5396048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5397048b5e81SShri Abhyankar tmp[0] = b[r[0]]; 5398048b5e81SShri Abhyankar v = aa; 5399048b5e81SShri Abhyankar vi = aj; 5400048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5401048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5402048b5e81SShri Abhyankar sum = b[r[i]]; 5403048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5404048b5e81SShri Abhyankar tmp[i] = sum; 5405048b5e81SShri 
Abhyankar v += nz; vi += nz; 5406048b5e81SShri Abhyankar } 5407048b5e81SShri Abhyankar 5408048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5409048b5e81SShri Abhyankar for (i=n-1; i>=0; i--) { 5410048b5e81SShri Abhyankar v = aa + adiag[i+1]+1; 5411048b5e81SShri Abhyankar vi = aj + adiag[i+1]+1; 5412048b5e81SShri Abhyankar nz = adiag[i]-adiag[i+1]-1; 5413048b5e81SShri Abhyankar sum = tmp[i]; 5414048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5415048b5e81SShri Abhyankar x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5416048b5e81SShri Abhyankar } 5417048b5e81SShri Abhyankar 5418048b5e81SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5419048b5e81SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 54203649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5421048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5422048b5e81SShri Abhyankar ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5423048b5e81SShri Abhyankar PetscFunctionReturn(0); 5424048b5e81SShri Abhyankar } 5425048b5e81SShri Abhyankar 542615091d37SBarry Smith /* 542715091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 542815091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
542915091d37SBarry Smith */ 54304a2ae208SSatish Balay #undef __FUNCT__ 543106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 543206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 543315091d37SBarry Smith { 543415091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5435b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5436dfbe8321SBarry Smith PetscErrorCode ierr; 5437b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5438b3260449SShri Abhyankar PetscScalar *x; 5439b3260449SShri Abhyankar const PetscScalar *b; 544087828ca2SBarry Smith PetscScalar s1,x1; 5441b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 544215091d37SBarry Smith 544315091d37SBarry Smith PetscFunctionBegin; 54443649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 54451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544615091d37SBarry Smith 544715091d37SBarry Smith /* forward solve the lower triangular */ 544815091d37SBarry Smith idx = 0; 544915091d37SBarry Smith x[0] = b[0]; 545015091d37SBarry Smith for (i=1; i<n; i++) { 545115091d37SBarry Smith v = aa + ai[i]; 545215091d37SBarry Smith vi = aj + ai[i]; 545315091d37SBarry Smith nz = diag[i] - ai[i]; 545415091d37SBarry Smith idx += 1; 5455f1af5d2fSBarry Smith s1 = b[idx]; 545615091d37SBarry Smith while (nz--) { 545715091d37SBarry Smith jdx = *vi++; 545815091d37SBarry Smith x1 = x[jdx]; 5459f1af5d2fSBarry Smith s1 -= v[0]*x1; 546015091d37SBarry Smith v += 1; 546115091d37SBarry Smith } 5462f1af5d2fSBarry Smith x[idx] = s1; 546315091d37SBarry Smith } 546415091d37SBarry Smith /* backward solve the upper triangular */ 546515091d37SBarry Smith for (i=n-1; i>=0; i--) { 546615091d37SBarry Smith v = aa + diag[i] + 1; 546715091d37SBarry Smith vi = aj + diag[i] + 1; 546815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 546915091d37SBarry Smith idt = i; 5470f1af5d2fSBarry Smith s1 = x[idt]; 547115091d37SBarry Smith 
while (nz--) { 547215091d37SBarry Smith idx = *vi++; 547315091d37SBarry Smith x1 = x[idx]; 5474f1af5d2fSBarry Smith s1 -= v[0]*x1; 547515091d37SBarry Smith v += 1; 547615091d37SBarry Smith } 547715091d37SBarry Smith v = aa + diag[i]; 5478f1af5d2fSBarry Smith x[idt] = v[0]*s1; 547915091d37SBarry Smith } 54803649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 54811ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5482dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 548315091d37SBarry Smith PetscFunctionReturn(0); 548415091d37SBarry Smith } 54854e2b4712SSatish Balay 5486048b5e81SShri Abhyankar 5487048b5e81SShri Abhyankar #undef __FUNCT__ 5488048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5489048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5490048b5e81SShri Abhyankar { 5491048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5492048b5e81SShri Abhyankar PetscErrorCode ierr; 5493048b5e81SShri Abhyankar const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5494048b5e81SShri Abhyankar PetscScalar *x,sum; 5495048b5e81SShri Abhyankar const PetscScalar *b; 5496048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5497048b5e81SShri Abhyankar PetscInt i,nz; 5498048b5e81SShri Abhyankar 5499048b5e81SShri Abhyankar PetscFunctionBegin; 5500048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5501048b5e81SShri Abhyankar 55023649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5503048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5504048b5e81SShri Abhyankar 5505048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5506048b5e81SShri Abhyankar x[0] = b[0]; 5507048b5e81SShri Abhyankar v = aa; 5508048b5e81SShri Abhyankar vi = aj; 5509048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5510048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5511048b5e81SShri Abhyankar sum = b[i]; 
5512048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5513048b5e81SShri Abhyankar v += nz; 5514048b5e81SShri Abhyankar vi += nz; 5515048b5e81SShri Abhyankar x[i] = sum; 5516048b5e81SShri Abhyankar } 5517048b5e81SShri Abhyankar 5518048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5519048b5e81SShri Abhyankar for (i=n-1; i>=0; i--) { 5520048b5e81SShri Abhyankar v = aa + adiag[i+1] + 1; 5521048b5e81SShri Abhyankar vi = aj + adiag[i+1] + 1; 5522048b5e81SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5523048b5e81SShri Abhyankar sum = x[i]; 5524048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5525048b5e81SShri Abhyankar x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5526048b5e81SShri Abhyankar } 5527048b5e81SShri Abhyankar 5528048b5e81SShri Abhyankar ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 55293649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5530048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5531048b5e81SShri Abhyankar PetscFunctionReturn(0); 5532048b5e81SShri Abhyankar } 5533048b5e81SShri Abhyankar 55344e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 553509573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool); 55366bce7ff8SHong Zhang 55372b0b2ea7SShri Abhyankar #undef __FUNCT__ 553829a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5539766f9fbaSBarry Smith /* 5540766f9fbaSBarry Smith This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5541766f9fbaSBarry Smith */ 554229a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 55432b0b2ea7SShri Abhyankar { 55442b0b2ea7SShri Abhyankar Mat C=B; 55452b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 55462b0b2ea7SShri Abhyankar 
PetscErrorCode ierr; 5547766f9fbaSBarry Smith PetscInt i,j,k,ipvt[15]; 5548766f9fbaSBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5549766f9fbaSBarry Smith PetscInt nz,nzL,row; 5550766f9fbaSBarry Smith MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5551766f9fbaSBarry Smith const MatScalar *v,*aa=a->a; 55522b0b2ea7SShri Abhyankar PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 55530fa040f9SShri Abhyankar PetscInt sol_ver; 55542b0b2ea7SShri Abhyankar 55552b0b2ea7SShri Abhyankar PetscFunctionBegin; 5556c55dd799SBarry Smith ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 55570fa040f9SShri Abhyankar 55582b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 55592b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 55602b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 55612b0b2ea7SShri Abhyankar 55622b0b2ea7SShri Abhyankar for (i=0; i<n; i++) { 55632b0b2ea7SShri Abhyankar /* zero rtmp */ 55642b0b2ea7SShri Abhyankar /* L part */ 55652b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 55662b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 55672b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 55682b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55692b0b2ea7SShri Abhyankar } 55702b0b2ea7SShri Abhyankar 55712b0b2ea7SShri Abhyankar /* U part */ 55722b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 55732b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 55742b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 55752b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55762b0b2ea7SShri Abhyankar } 55772b0b2ea7SShri Abhyankar 55782b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 557929a97285SShri Abhyankar nz = ai[i+1] - ai[i]; 558029a97285SShri Abhyankar ajtmp = aj + ai[i]; 
558129a97285SShri Abhyankar v = aa + bs2*ai[i]; 55822b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 558329a97285SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 55842b0b2ea7SShri Abhyankar } 55852b0b2ea7SShri Abhyankar 55862b0b2ea7SShri Abhyankar /* elimination */ 55872b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 55882b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 55892b0b2ea7SShri Abhyankar for (k=0;k < nzL;k++) { 55902b0b2ea7SShri Abhyankar row = bjtmp[k]; 55912b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 5592c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 5593c35f09e5SBarry Smith if (pc[j]!=0.0) { 5594c35f09e5SBarry Smith flg = 1; 5595c35f09e5SBarry Smith break; 5596c35f09e5SBarry Smith } 5597c35f09e5SBarry Smith } 55982b0b2ea7SShri Abhyankar if (flg) { 55992b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 560096b95a6bSBarry Smith PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); 560196b95a6bSBarry Smith /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 56022b0b2ea7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 56032b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 56042b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 56052b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5606766f9fbaSBarry Smith vv = rtmp + bs2*pj[j]; 560796b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 560896b95a6bSBarry Smith /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 56092b0b2ea7SShri Abhyankar pv += bs2; 56102b0b2ea7SShri Abhyankar } 5611766f9fbaSBarry Smith ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 56122b0b2ea7SShri Abhyankar } 56132b0b2ea7SShri Abhyankar } 56142b0b2ea7SShri Abhyankar 56152b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 56162b0b2ea7SShri Abhyankar /* L part */ 56172b0b2ea7SShri Abhyankar pv = b->a 
+ bs2*bi[i] ; 56182b0b2ea7SShri Abhyankar pj = b->j + bi[i] ; 56192b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56202b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 56212b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56222b0b2ea7SShri Abhyankar } 56232b0b2ea7SShri Abhyankar 56242b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 56252b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 56262b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 56272b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 562896b95a6bSBarry Smith /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */ 562996b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 56302b0b2ea7SShri Abhyankar 56312b0b2ea7SShri Abhyankar /* U part */ 56322b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 56332b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 56342b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 56352b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 56362b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56372b0b2ea7SShri Abhyankar } 56382b0b2ea7SShri Abhyankar } 56392b0b2ea7SShri Abhyankar 56402b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5641832cc040SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5642766f9fbaSBarry Smith C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 56432b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5644766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 56452b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 56462b0b2ea7SShri Abhyankar } 56472b0b2ea7SShri Abhyankar 56486bce7ff8SHong Zhang #undef __FUNCT__ 56494dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 56504dd39f65SShri 
Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 56516bce7ff8SHong Zhang { 56526bce7ff8SHong Zhang Mat C=B; 56536bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 56546bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 56556bce7ff8SHong Zhang PetscErrorCode ierr; 56565a586d82SBarry Smith const PetscInt *r,*ic; 56576bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 56586bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5659b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5660914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5661914a18a2SHong Zhang MatScalar *v_work; 5662ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 56636bce7ff8SHong Zhang 56646bce7ff8SHong Zhang PetscFunctionBegin; 56656bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 56666bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5667ae3d28f0SHong Zhang 5668fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5669fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 56706bce7ff8SHong Zhang 5671914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5672fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5673914a18a2SHong Zhang 56746bce7ff8SHong Zhang for (i=0; i<n; i++) { 56756bce7ff8SHong Zhang /* zero rtmp */ 56766bce7ff8SHong Zhang /* L part */ 56776bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 56786bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5679914a18a2SHong Zhang for (j=0; j<nz; j++) { 5680914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5681914a18a2SHong Zhang } 56826bce7ff8SHong Zhang 56836bce7ff8SHong Zhang /* U part */ 56841a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 
56851a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 56861a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56871a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56881a83e813SShri Abhyankar } 56891a83e813SShri Abhyankar 56901a83e813SShri Abhyankar /* load in initial (unfactored row) */ 56911a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 56921a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 56931a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 56941a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56951a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 56961a83e813SShri Abhyankar } 56971a83e813SShri Abhyankar 56981a83e813SShri Abhyankar /* elimination */ 56991a83e813SShri Abhyankar bjtmp = bj + bi[i]; 57001a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 57011a83e813SShri Abhyankar for (k=0;k < nzL;k++) { 57021a83e813SShri Abhyankar row = bjtmp[k]; 57031a83e813SShri Abhyankar pc = rtmp + bs2*row; 5704c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 5705c35f09e5SBarry Smith if (pc[j]!=0.0) { 5706c35f09e5SBarry Smith flg = 1; 5707c35f09e5SBarry Smith break; 5708c35f09e5SBarry Smith } 5709c35f09e5SBarry Smith } 57101a83e813SShri Abhyankar if (flg) { 57111a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 571296b95a6bSBarry Smith PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 57131a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 57141a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 57151a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 57161a83e813SShri Abhyankar for (j=0; j<nz; j++) { 571796b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 57181a83e813SShri Abhyankar } 57191a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 57201a83e813SShri 
Abhyankar } 57211a83e813SShri Abhyankar } 57221a83e813SShri Abhyankar 57231a83e813SShri Abhyankar /* finished row so stick it into b->a */ 57241a83e813SShri Abhyankar /* L part */ 57251a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 57261a83e813SShri Abhyankar pj = b->j + bi[i] ; 57271a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 57281a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57291a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57301a83e813SShri Abhyankar } 57311a83e813SShri Abhyankar 57321a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 57331a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 57341a83e813SShri Abhyankar pj = b->j + bdiag[i]; 5735e32f2f54SBarry Smith /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 57361a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 573796b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 57381a83e813SShri Abhyankar 57391a83e813SShri Abhyankar /* U part */ 57401a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 57411a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 57421a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 57431a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57441a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57451a83e813SShri Abhyankar } 57461a83e813SShri Abhyankar } 57471a83e813SShri Abhyankar 57481a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5749fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 57501a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 57511a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 57521a83e813SShri Abhyankar 5753ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5754ae3d28f0SHong Zhang ierr = 
ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5755ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 5756ae3d28f0SHong Zhang if (both_identity) { 57574dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5758ae3d28f0SHong Zhang } else { 57594dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5760ae3d28f0SHong Zhang } 57614dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5762ae3d28f0SHong Zhang 57631a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 5764766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 57651a83e813SShri Abhyankar PetscFunctionReturn(0); 57661a83e813SShri Abhyankar } 57671a83e813SShri Abhyankar 57686bce7ff8SHong Zhang /* 57696bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 57704dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 57714dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
57726bce7ff8SHong Zhang */ 5773c0c7eb62SShri Abhyankar 57746bce7ff8SHong Zhang #undef __FUNCT__ 57754dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 57764dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 57776bce7ff8SHong Zhang { 57786bce7ff8SHong Zhang 57796bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 57806bce7ff8SHong Zhang PetscErrorCode ierr; 578116a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 578235aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 578335aa4fcfSShri Abhyankar 578435aa4fcfSShri Abhyankar PetscFunctionBegin; 578535aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 578635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 578735aa4fcfSShri Abhyankar 578835aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 578935aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 579035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 579135aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 5792379be0ddSLisandro Dalcin b->free_a = PETSC_TRUE; 5793379be0ddSLisandro Dalcin b->free_ij = PETSC_TRUE; 57941e40a84eSLisandro Dalcin fact->preallocated = PETSC_TRUE; 57951e40a84eSLisandro Dalcin fact->assembled = PETSC_TRUE; 579635aa4fcfSShri Abhyankar if (!b->diag) { 579735aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 579835aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 579935aa4fcfSShri Abhyankar } 580035aa4fcfSShri Abhyankar bdiag = b->diag; 580135aa4fcfSShri Abhyankar 580235aa4fcfSShri Abhyankar if (n > 0) { 580335aa4fcfSShri Abhyankar ierr = 
PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 580435aa4fcfSShri Abhyankar } 580535aa4fcfSShri Abhyankar 580635aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 580735aa4fcfSShri Abhyankar bi = b->i; 580835aa4fcfSShri Abhyankar bj = b->j; 580935aa4fcfSShri Abhyankar 581035aa4fcfSShri Abhyankar /* L part */ 581135aa4fcfSShri Abhyankar bi[0] = 0; 581235aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 581335aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 581435aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 581535aa4fcfSShri Abhyankar aj = a->j + ai[i]; 581635aa4fcfSShri Abhyankar for (j=0; j<nz; j++) { 581735aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 581835aa4fcfSShri Abhyankar } 581935aa4fcfSShri Abhyankar } 582035aa4fcfSShri Abhyankar 582135aa4fcfSShri Abhyankar /* U part */ 582235aa4fcfSShri Abhyankar bi_temp = bi[n]; 582335aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 582435aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--) { 582535aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 582635aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 582735aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 582835aa4fcfSShri Abhyankar for (j=0; j<nz; j++) { 582935aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 583035aa4fcfSShri Abhyankar } 583135aa4fcfSShri Abhyankar /* diag[i] */ 583235aa4fcfSShri Abhyankar *bj = i; bj++; 583335aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 583435aa4fcfSShri Abhyankar } 583535aa4fcfSShri Abhyankar PetscFunctionReturn(0); 583635aa4fcfSShri Abhyankar } 583735aa4fcfSShri Abhyankar 583835aa4fcfSShri Abhyankar #undef __FUNCT__ 58394dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 58404dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 584116a2bf60SHong Zhang { 584216a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 584316a2bf60SHong Zhang IS isicol; 584416a2bf60SHong Zhang PetscErrorCode ierr; 584516a2bf60SHong Zhang const PetscInt *r,*ic; 
58467fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 584716a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 584816a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 584916a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 5850ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 585116a2bf60SHong Zhang PetscReal f; 585216a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 585316a2bf60SHong Zhang PetscBT lnkbt; 585416a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 585516a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 585616a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5857ace3abfcSBarry Smith PetscBool missing; 58587fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 585916a2bf60SHong Zhang 586016a2bf60SHong Zhang PetscFunctionBegin; 5861e32f2f54SBarry Smith if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 58626ba06ab7SHong Zhang if (bs>1) { /* check shifttype */ 58636ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 58646ba06ab7SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 58656ba06ab7SHong Zhang } 58666ba06ab7SHong Zhang 586716a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5868e32f2f54SBarry Smith if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 586916a2bf60SHong Zhang 587016a2bf60SHong Zhang f = info->fill; 587116a2bf60SHong Zhang levels = (PetscInt)info->levels; 587216a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 587316a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 587416a2bf60SHong Zhang 587516a2bf60SHong Zhang ierr = 
ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 587616a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5877ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 587816a2bf60SHong Zhang 58797fa3a6a0SHong Zhang if (!levels && both_identity) { 588016a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 58814dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 58824dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 588335aa4fcfSShri Abhyankar 5884d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 588535aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 588635aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 588735aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 588835aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 588935aa4fcfSShri Abhyankar b->row = isrow; 589035aa4fcfSShri Abhyankar b->col = iscol; 589135aa4fcfSShri Abhyankar b->icol = isicol; 589235aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 589335aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 589435aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 589535aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 589635aa4fcfSShri Abhyankar PetscFunctionReturn(0); 589735aa4fcfSShri Abhyankar } 589835aa4fcfSShri Abhyankar 589935aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 590035aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 590135aa4fcfSShri Abhyankar 590235aa4fcfSShri Abhyankar /* get new row pointers */ 590335aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 590435aa4fcfSShri Abhyankar bi[0] = 0; 590535aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 590635aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 590735aa4fcfSShri Abhyankar bdiag[0] = 0; 590835aa4fcfSShri Abhyankar 5909fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 591035aa4fcfSShri Abhyankar 591135aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 591235aa4fcfSShri Abhyankar nlnk = n + 1; 591335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 591435aa4fcfSShri Abhyankar 591535aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 591635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 591735aa4fcfSShri Abhyankar current_space = free_space; 591835aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 591935aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 592035aa4fcfSShri Abhyankar 592135aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 592235aa4fcfSShri Abhyankar nzi = 0; 592335aa4fcfSShri Abhyankar /* copy current row into linked list */ 592435aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 5925e32f2f54SBarry Smith if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row 
in original ordering %D in permuted ordering %D",r[i],i); 592635aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 592735aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 592835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 592935aa4fcfSShri Abhyankar nzi += nlnk; 593035aa4fcfSShri Abhyankar 593135aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 593235aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 593335aa4fcfSShri Abhyankar fm = n; 593435aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 593535aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 593635aa4fcfSShri Abhyankar lnk[fm] = i; 593735aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 593835aa4fcfSShri Abhyankar nzi++; dcount++; 593935aa4fcfSShri Abhyankar } 594035aa4fcfSShri Abhyankar 594135aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 594235aa4fcfSShri Abhyankar nzbd = 0; 594335aa4fcfSShri Abhyankar prow = lnk[n]; 594435aa4fcfSShri Abhyankar while (prow < i) { 594535aa4fcfSShri Abhyankar nnz = bdiag[prow]; 594635aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 594735aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 594835aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 594935aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 595035aa4fcfSShri Abhyankar nzi += nlnk; 595135aa4fcfSShri Abhyankar prow = lnk[prow]; 595235aa4fcfSShri Abhyankar nzbd++; 595335aa4fcfSShri Abhyankar } 595435aa4fcfSShri Abhyankar bdiag[i] = nzbd; 595535aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 595635aa4fcfSShri Abhyankar 595735aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 595835aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 595935aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 596035aa4fcfSShri 
Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 596135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 596235aa4fcfSShri Abhyankar reallocs++; 596335aa4fcfSShri Abhyankar } 596435aa4fcfSShri Abhyankar 596535aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 596635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 596735aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 596835aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 596935aa4fcfSShri Abhyankar 597035aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 597165e19b50SBarry Smith if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 597235aa4fcfSShri Abhyankar 597335aa4fcfSShri Abhyankar current_space->array += nzi; 597435aa4fcfSShri Abhyankar current_space->local_used += nzi; 597535aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 597635aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 597735aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 597835aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 597935aa4fcfSShri Abhyankar } 598035aa4fcfSShri Abhyankar 598135aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 598235aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 598335aa4fcfSShri Abhyankar 598435aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 59859263d837SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 59862ce24eb6SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 598735aa4fcfSShri Abhyankar 598835aa4fcfSShri Abhyankar ierr = 
PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 598935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5990fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 599135aa4fcfSShri Abhyankar 599235aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 599335aa4fcfSShri Abhyankar { 5994aef85c9fSShri Abhyankar PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 599535aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 599635aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 599735aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 599835aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 599935aa4fcfSShri Abhyankar if (diagonal_fill) { 600035aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 600135aa4fcfSShri Abhyankar } 600235aa4fcfSShri Abhyankar } 600335aa4fcfSShri Abhyankar #endif 600435aa4fcfSShri Abhyankar 600535aa4fcfSShri Abhyankar /* put together the new matrix */ 600635aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 600735aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 600835aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 600935aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 601035aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 601135aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 601235aa4fcfSShri Abhyankar ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 601335aa4fcfSShri Abhyankar b->j = bj; 601435aa4fcfSShri Abhyankar b->i = bi; 601535aa4fcfSShri Abhyankar b->diag = bdiag; 601635aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 601735aa4fcfSShri Abhyankar b->ilen = 0; 601835aa4fcfSShri Abhyankar b->imax = 0; 
601935aa4fcfSShri Abhyankar b->row = isrow; 602035aa4fcfSShri Abhyankar b->col = iscol; 602135aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 602235aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 602335aa4fcfSShri Abhyankar b->icol = isicol; 602435aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 602535aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 602635aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 602735aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 602835aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 6029ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 6030ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6031ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 60324dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 603335aa4fcfSShri Abhyankar PetscFunctionReturn(0); 603435aa4fcfSShri Abhyankar } 603535aa4fcfSShri Abhyankar 60364e2b4712SSatish Balay /* 60374e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 60384e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 60394e2b4712SSatish Balay Not a good example of code reuse. 
60404e2b4712SSatish Balay */ 60414a2ae208SSatish Balay #undef __FUNCT__ 604206e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 604306e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 60444e2b4712SSatish Balay { 60454e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 60464e2b4712SSatish Balay IS isicol; 60476849ba73SBarry Smith PetscErrorCode ierr; 60485d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 60495d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6050a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6051d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6052ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity,flg; 6053329f5518SBarry Smith PetscReal f; 60544e2b4712SSatish Balay 60554e2b4712SSatish Balay PetscFunctionBegin; 60566bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6057e32f2f54SBarry Smith if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 60586bce7ff8SHong Zhang 6059435faa5fSBarry Smith f = info->fill; 6060690b6cddSBarry Smith levels = (PetscInt)info->levels; 6061690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 60624c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 606316a2bf60SHong Zhang 6064667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6065667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6066ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 6067309c388cSBarry Smith 606841df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 606916a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 60708b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 60716bce7ff8SHong Zhang 6072d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 6073ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6074bb3d539aSBarry Smith b->row = isrow; 6075bb3d539aSBarry Smith b->col = iscol; 6076bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6077bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6078bb3d539aSBarry Smith b->icol = isicol; 6079bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6080b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 60816bce7ff8SHong Zhang PetscFunctionReturn(0); 60826bce7ff8SHong Zhang } 60836bce7ff8SHong Zhang 60846bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 60854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 60864e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 60874e2b4712SSatish Balay 60884e2b4712SSatish Balay /* get new row pointers */ 6089690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 60904e2b4712SSatish Balay ainew[0] = 0; 60914e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 6092690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 6093690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 60944e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 6095690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 60964e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 6097690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 60984e2b4712SSatish Balay /* im is level for each filled value 
*/ 6099690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 61004e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 6101690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 61024e2b4712SSatish Balay dloc[0] = 0; 61034e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 6104435faa5fSBarry Smith 6105435faa5fSBarry Smith /* copy prow into linked list */ 61064e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6107e32f2f54SBarry Smith if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 61084e2b4712SSatish Balay xi = aj + ai[r[prow]]; 61094e2b4712SSatish Balay fill[n] = n; 6110435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 61114e2b4712SSatish Balay while (nz--) { 61124e2b4712SSatish Balay fm = n; 61134e2b4712SSatish Balay idx = ic[*xi++]; 61144e2b4712SSatish Balay do { 61154e2b4712SSatish Balay m = fm; 61164e2b4712SSatish Balay fm = fill[m]; 61174e2b4712SSatish Balay } while (fm < idx); 61184e2b4712SSatish Balay fill[m] = idx; 61194e2b4712SSatish Balay fill[idx] = fm; 61204e2b4712SSatish Balay im[idx] = 0; 61214e2b4712SSatish Balay } 6122435faa5fSBarry Smith 6123435faa5fSBarry Smith /* make sure diagonal entry is included */ 6124435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 6125435faa5fSBarry Smith fm = n; 6126435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 6127435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6128435faa5fSBarry Smith fill[fm] = prow; 6129435faa5fSBarry Smith im[prow] = 0; 6130435faa5fSBarry Smith nzf++; 6131335d9088SBarry Smith dcount++; 6132435faa5fSBarry Smith } 6133435faa5fSBarry Smith 61344e2b4712SSatish Balay nzi = 0; 61354e2b4712SSatish Balay row = fill[n]; 61364e2b4712SSatish Balay while (row < prow) { 61374e2b4712SSatish Balay incrlev = im[row] + 1; 61384e2b4712SSatish Balay nz = 
dloc[row]; 6139435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 61404e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 61414e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 61424e2b4712SSatish Balay fm = row; 61434e2b4712SSatish Balay while (nnz-- > 0) { 61444e2b4712SSatish Balay idx = *xi++; 61454e2b4712SSatish Balay if (*flev + incrlev > levels) { 61464e2b4712SSatish Balay flev++; 61474e2b4712SSatish Balay continue; 61484e2b4712SSatish Balay } 61494e2b4712SSatish Balay do { 61504e2b4712SSatish Balay m = fm; 61514e2b4712SSatish Balay fm = fill[m]; 61524e2b4712SSatish Balay } while (fm < idx); 61534e2b4712SSatish Balay if (fm != idx) { 61544e2b4712SSatish Balay im[idx] = *flev + incrlev; 61554e2b4712SSatish Balay fill[m] = idx; 61564e2b4712SSatish Balay fill[idx] = fm; 61574e2b4712SSatish Balay fm = idx; 61584e2b4712SSatish Balay nzf++; 6159ecf371e4SBarry Smith } else { 61604e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 61614e2b4712SSatish Balay } 61624e2b4712SSatish Balay flev++; 61634e2b4712SSatish Balay } 61644e2b4712SSatish Balay row = fill[row]; 61654e2b4712SSatish Balay nzi++; 61664e2b4712SSatish Balay } 61674e2b4712SSatish Balay /* copy new filled row into permanent storage */ 61684e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 61694e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 6170ecf371e4SBarry Smith 6171ecf371e4SBarry Smith /* estimate how much additional space we will need */ 6172ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6173ecf371e4SBarry Smith /* just double the memory each time */ 6174690b6cddSBarry Smith PetscInt maxadd = jmax; 6175ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 61764e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 61774e2b4712SSatish Balay jmax += maxadd; 6178ecf371e4SBarry Smith 6179ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 61805d0c19d7SBarry Smith 
ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 61815d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6182606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 61835d0c19d7SBarry Smith ajnew = xitmp; 61845d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 61855d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6186606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 61875d0c19d7SBarry Smith ajfill = xitmp; 6188eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 61894e2b4712SSatish Balay } 61905d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 61914e2b4712SSatish Balay flev = ajfill + ainew[prow]; 61924e2b4712SSatish Balay dloc[prow] = nzi; 61934e2b4712SSatish Balay fm = fill[n]; 61944e2b4712SSatish Balay while (nzf--) { 61955d0c19d7SBarry Smith *xitmp++ = fm; 61964e2b4712SSatish Balay *flev++ = im[fm]; 61974e2b4712SSatish Balay fm = fill[fm]; 61984e2b4712SSatish Balay } 6199435faa5fSBarry Smith /* make sure row has diagonal entry */ 6200435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6201e32f2f54SBarry Smith SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 62022401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6203435faa5fSBarry Smith } 62044e2b4712SSatish Balay } 6205606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62064e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 62074e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6208606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 6209606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 62104e2b4712SSatish Balay 62116cf91177SBarry Smith #if defined(PETSC_USE_INFO) 62124e2b4712SSatish Balay { 6213329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 6214ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6215ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6216ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6217ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6218335d9088SBarry Smith if (diagonal_fill) { 6219ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6220335d9088SBarry Smith } 62214e2b4712SSatish Balay } 622263ba0a88SBarry Smith #endif 62234e2b4712SSatish Balay 62244e2b4712SSatish Balay /* put together the new matrix */ 6225719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6226719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6227ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6228e6b907acSBarry Smith b->free_a = PETSC_TRUE; 6229e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 62307c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 6231a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 62324e2b4712SSatish Balay b->j = ajnew; 62334e2b4712SSatish Balay b->i = ainew; 62344e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 62354e2b4712SSatish Balay b->diag = dloc; 62367f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 62374e2b4712SSatish Balay b->ilen = 0; 62384e2b4712SSatish Balay b->imax = 0; 62394e2b4712SSatish Balay b->row = isrow; 62404e2b4712SSatish Balay b->col = iscol; 6241bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 6242c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6243c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6244e51c0b9cSSatish Balay b->icol = isicol; 624587828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 62464e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 62474e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 6248719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 62494e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 62504e2b4712SSatish Balay 6251ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 6252ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6253ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 62546bce7ff8SHong Zhang 62558b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 62568661488fSKris Buschelman PetscFunctionReturn(0); 62578661488fSKris Buschelman } 62588661488fSKris Buschelman 6259732ee342SKris Buschelman #undef __FUNCT__ 62607e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6261dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 62627e7071cdSKris Buschelman { 626312272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 626412272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 62655a9542e3SKris Buschelman PetscFunctionBegin; 62667cf1b8d3SKris Buschelman /* Undo Column scaling */ 62677cf1b8d3SKris Buschelman /* while (nz--) { */ 62687cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 62697cf1b8d3SKris Buschelman /* } */ 6270c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6271c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62727cf1b8d3SKris Buschelman PetscFunctionReturn(0); 62737cf1b8d3SKris Buschelman } 62747cf1b8d3SKris Buschelman 62757cf1b8d3SKris Buschelman #undef __FUNCT__ 62767cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6277dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 62787cf1b8d3SKris Buschelman { 62797cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6280b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 62812aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 62825a9542e3SKris Buschelman PetscFunctionBegin; 62830b9da03eSKris Buschelman /* Is this really necessary? */ 628420235379SKris Buschelman while (nz--) { 62850b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 62867e7071cdSKris Buschelman } 6287c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62887e7071cdSKris Buschelman PetscFunctionReturn(0); 62897e7071cdSKris Buschelman } 62907e7071cdSKris Buschelman 6291732ee342SKris Buschelman 6292