1be1d678aSKris Buschelman 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 7c6db04a5SJed Brown #include <../src/mat/blockinvert.h> 8c6db04a5SJed Brown #include <petscbt.h> 9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1493fd935bSShri Abhyankar { 1593fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1693fd935bSShri Abhyankar PetscErrorCode ierr; 1793fd935bSShri Abhyankar const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 1893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 1993fd935bSShri Abhyankar PetscInt nz; 2093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 2193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 2293fd935bSShri Abhyankar const PetscScalar *b; 2393fd935bSShri Abhyankar 2493fd935bSShri Abhyankar PetscFunctionBegin; 253649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2793fd935bSShri Abhyankar tmp = a->solve_work; 2893fd935bSShri Abhyankar 2993fd935bSShri Abhyankar 3093fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 3193fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[i]; 3293fd935bSShri Abhyankar 3393fd935bSShri Abhyankar /* forward solve the U^T */ 3493fd935bSShri Abhyankar for (i=0; i<n; i++) { 3593fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 3693fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 3793fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 3893fd935bSShri Abhyankar s1 = tmp[i]; 3993fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 4093fd935bSShri 
Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 4193fd935bSShri Abhyankar tmp[i] = s1; 4293fd935bSShri Abhyankar } 4393fd935bSShri Abhyankar 4493fd935bSShri Abhyankar /* backward solve the L^T */ 4593fd935bSShri Abhyankar for (i=n-1; i>=0; i--) { 4693fd935bSShri Abhyankar v = aa + ai[i]; 4793fd935bSShri Abhyankar vi = aj + ai[i]; 4893fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 4993fd935bSShri Abhyankar s1 = tmp[i]; 5093fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 5193fd935bSShri Abhyankar } 5293fd935bSShri Abhyankar 5393fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 5493fd935bSShri Abhyankar for (i=0; i<n; i++) x[i] = tmp[i]; 5593fd935bSShri Abhyankar 563649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5793fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5893fd935bSShri Abhyankar 5993fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 6093fd935bSShri Abhyankar PetscFunctionReturn(0); 6193fd935bSShri Abhyankar } 6293fd935bSShri Abhyankar 6393fd935bSShri Abhyankar #undef __FUNCT__ 6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66f1af5d2fSBarry Smith { 67f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 68dfbe8321SBarry Smith PetscErrorCode ierr; 690b68f018SBarry Smith PetscInt i,nz; 700b68f018SBarry Smith const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 710b68f018SBarry Smith const MatScalar *aa =a->a,*v; 720b68f018SBarry Smith PetscScalar s1,*x; 73f1af5d2fSBarry Smith 74f1af5d2fSBarry Smith PetscFunctionBegin; 75ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 77f1af5d2fSBarry Smith 78f1af5d2fSBarry Smith /* forward solve the U^T */ 79f1af5d2fSBarry Smith for (i=0; i<n; i++) { 80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith v = aa + diag[i]; 82f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 83ef66eb69SBarry Smith s1 = (*v++)*x[i]; 84f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 85f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 86f1af5d2fSBarry Smith while (nz--) { 87f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 88f1af5d2fSBarry Smith } 89f1af5d2fSBarry Smith x[i] = s1; 90f1af5d2fSBarry Smith } 91f1af5d2fSBarry Smith /* backward solve the L^T */ 92f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 93f1af5d2fSBarry Smith v = aa + diag[i] - 1; 94f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 95f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 96f1af5d2fSBarry Smith s1 = x[i]; 97f1af5d2fSBarry Smith while (nz--) { 98f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 99f1af5d2fSBarry Smith } 100f1af5d2fSBarry Smith } 1011ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 102dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 103f1af5d2fSBarry Smith PetscFunctionReturn(0); 104f1af5d2fSBarry Smith } 105f1af5d2fSBarry Smith 1064a2ae208SSatish Balay #undef __FUNCT__ 10706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 10806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 109f1af5d2fSBarry Smith { 110f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 111dfbe8321SBarry Smith PetscErrorCode ierr; 112b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 113b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 114b3260449SShri Abhyankar const MatScalar *aa =a->a,*v; 115b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 116f1af5d2fSBarry Smith 117f1af5d2fSBarry Smith PetscFunctionBegin; 118ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 120f1af5d2fSBarry Smith 121f1af5d2fSBarry Smith /* forward solve the U^T */ 
122f1af5d2fSBarry Smith idx = 0; 123f1af5d2fSBarry Smith for (i=0; i<n; i++) { 124f1af5d2fSBarry Smith 125f1af5d2fSBarry Smith v = aa + 4*diag[i]; 126f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 127ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 128f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 129f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 130f1af5d2fSBarry Smith v += 4; 131f1af5d2fSBarry Smith 132f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 133f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 134f1af5d2fSBarry Smith while (nz--) { 135f1af5d2fSBarry Smith oidx = 2*(*vi++); 136f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 137f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 138f1af5d2fSBarry Smith v += 4; 139f1af5d2fSBarry Smith } 140f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 141f1af5d2fSBarry Smith idx += 2; 142f1af5d2fSBarry Smith } 143f1af5d2fSBarry Smith /* backward solve the L^T */ 144f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 145f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 146f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 147f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 148f1af5d2fSBarry Smith idt = 2*i; 149f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 150f1af5d2fSBarry Smith while (nz--) { 151f1af5d2fSBarry Smith idx = 2*(*vi--); 152f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 153f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 154f1af5d2fSBarry Smith v -= 4; 155f1af5d2fSBarry Smith } 156f1af5d2fSBarry Smith } 1571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 158dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 159f1af5d2fSBarry Smith PetscFunctionReturn(0); 160f1af5d2fSBarry Smith } 161f1af5d2fSBarry Smith 1624a2ae208SSatish Balay #undef __FUNCT__ 1634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 
1656929473cSShri Abhyankar { 1666929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 1676929473cSShri Abhyankar PetscErrorCode ierr; 168b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1696929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 170b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 171b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 172b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 1736929473cSShri Abhyankar 1746929473cSShri Abhyankar PetscFunctionBegin; 1756929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1766929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1776929473cSShri Abhyankar 1786929473cSShri Abhyankar /* forward solve the U^T */ 1796929473cSShri Abhyankar idx = 0; 1806929473cSShri Abhyankar for (i=0; i<n; i++) { 1816929473cSShri Abhyankar v = aa + bs2*diag[i]; 1826929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1836929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1846929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1856929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1866929473cSShri Abhyankar v -= bs2; 1876929473cSShri Abhyankar 1886929473cSShri Abhyankar vi = aj + diag[i] - 1; 1896929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1906929473cSShri Abhyankar for (j=0; j>-nz; j--) { 1916929473cSShri Abhyankar oidx = bs*vi[j]; 1926929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1936929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1946929473cSShri Abhyankar v -= bs2; 1956929473cSShri Abhyankar } 1966929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1976929473cSShri Abhyankar idx += bs; 1986929473cSShri Abhyankar } 1996929473cSShri Abhyankar /* backward solve the L^T */ 2006929473cSShri Abhyankar for (i=n-1; i>=0; i--) { 2016929473cSShri Abhyankar v = aa + bs2*ai[i]; 2026929473cSShri Abhyankar vi = aj + ai[i]; 2036929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 2046929473cSShri Abhyankar idt = bs*i; 2056929473cSShri 
Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2066929473cSShri Abhyankar for (j=0; j<nz; j++) { 2076929473cSShri Abhyankar idx = bs*vi[j]; 2086929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 2096929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 2106929473cSShri Abhyankar v += bs2; 2116929473cSShri Abhyankar } 2126929473cSShri Abhyankar } 2136929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2146929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2156929473cSShri Abhyankar PetscFunctionReturn(0); 2166929473cSShri Abhyankar } 2176929473cSShri Abhyankar 2186929473cSShri Abhyankar #undef __FUNCT__ 21906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 22006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 221f1af5d2fSBarry Smith { 222f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 223dfbe8321SBarry Smith PetscErrorCode ierr; 224b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 225b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 226b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 227b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 228f1af5d2fSBarry Smith 229f1af5d2fSBarry Smith PetscFunctionBegin; 230ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 232f1af5d2fSBarry Smith 233f1af5d2fSBarry Smith /* forward solve the U^T */ 234f1af5d2fSBarry Smith idx = 0; 235f1af5d2fSBarry Smith for (i=0; i<n; i++) { 236f1af5d2fSBarry Smith 237f1af5d2fSBarry Smith v = aa + 9*diag[i]; 238f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 239ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 240f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 241f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 242f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 
243f1af5d2fSBarry Smith v += 9; 244f1af5d2fSBarry Smith 245f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 246f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 247f1af5d2fSBarry Smith while (nz--) { 248f1af5d2fSBarry Smith oidx = 3*(*vi++); 249f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 250f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 251f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 252f1af5d2fSBarry Smith v += 9; 253f1af5d2fSBarry Smith } 254f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 255f1af5d2fSBarry Smith idx += 3; 256f1af5d2fSBarry Smith } 257f1af5d2fSBarry Smith /* backward solve the L^T */ 258f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 259f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 260f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 261f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 262f1af5d2fSBarry Smith idt = 3*i; 263f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 264f1af5d2fSBarry Smith while (nz--) { 265f1af5d2fSBarry Smith idx = 3*(*vi--); 266f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 267f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 268f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 269f1af5d2fSBarry Smith v -= 9; 270f1af5d2fSBarry Smith } 271f1af5d2fSBarry Smith } 2721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 273dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 274f1af5d2fSBarry Smith PetscFunctionReturn(0); 275f1af5d2fSBarry Smith } 276f1af5d2fSBarry Smith 2774a2ae208SSatish Balay #undef __FUNCT__ 2784dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2794dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2808499736aSShri Abhyankar { 2818499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 2828499736aSShri Abhyankar PetscErrorCode ierr; 283b3260449SShri Abhyankar const 
PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2848499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 285b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 286b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 287b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 2888499736aSShri Abhyankar 2898499736aSShri Abhyankar PetscFunctionBegin; 2908499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 2918499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2928499736aSShri Abhyankar 2938499736aSShri Abhyankar /* forward solve the U^T */ 2948499736aSShri Abhyankar idx = 0; 2958499736aSShri Abhyankar for (i=0; i<n; i++) { 2968499736aSShri Abhyankar v = aa + bs2*diag[i]; 2978499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 2988499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2998499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 3008499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 3018499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 3028499736aSShri Abhyankar v -= bs2; 3038499736aSShri Abhyankar 3048499736aSShri Abhyankar vi = aj + diag[i] - 1; 3058499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3068499736aSShri Abhyankar for (j=0; j>-nz; j--) { 3078499736aSShri Abhyankar oidx = bs*vi[j]; 3088499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3098499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3108499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3118499736aSShri Abhyankar v -= bs2; 3128499736aSShri Abhyankar } 3138499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 3148499736aSShri Abhyankar idx += bs; 3158499736aSShri Abhyankar } 3168499736aSShri Abhyankar /* backward solve the L^T */ 3178499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 3188499736aSShri Abhyankar v = aa + bs2*ai[i]; 3198499736aSShri Abhyankar vi = aj + ai[i]; 3208499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 3218499736aSShri 
Abhyankar idt = bs*i; 3228499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 3238499736aSShri Abhyankar for (j=0; j<nz; j++) { 3248499736aSShri Abhyankar idx = bs*vi[j]; 3258499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3268499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3278499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3288499736aSShri Abhyankar v += bs2; 3298499736aSShri Abhyankar } 3308499736aSShri Abhyankar } 3318499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3328499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3338499736aSShri Abhyankar PetscFunctionReturn(0); 3348499736aSShri Abhyankar } 3358499736aSShri Abhyankar 3368499736aSShri Abhyankar #undef __FUNCT__ 33706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 33806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 339f1af5d2fSBarry Smith { 340f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 341dfbe8321SBarry Smith PetscErrorCode ierr; 342b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 343b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 344b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 345b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 346f1af5d2fSBarry Smith 347f1af5d2fSBarry Smith PetscFunctionBegin; 348ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3491ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 350f1af5d2fSBarry Smith 351f1af5d2fSBarry Smith /* forward solve the U^T */ 352f1af5d2fSBarry Smith idx = 0; 353f1af5d2fSBarry Smith for (i=0; i<n; i++) { 354f1af5d2fSBarry Smith 355f1af5d2fSBarry Smith v = aa + 16*diag[i]; 356f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 357ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 358f1af5d2fSBarry 
Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 359f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 360f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 361f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 362f1af5d2fSBarry Smith v += 16; 363f1af5d2fSBarry Smith 364f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 365f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 366f1af5d2fSBarry Smith while (nz--) { 367f1af5d2fSBarry Smith oidx = 4*(*vi++); 368f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 369f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 370f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 371f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 372f1af5d2fSBarry Smith v += 16; 373f1af5d2fSBarry Smith } 374f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 375f1af5d2fSBarry Smith idx += 4; 376f1af5d2fSBarry Smith } 377f1af5d2fSBarry Smith /* backward solve the L^T */ 378f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 379f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 380f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 381f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 382f1af5d2fSBarry Smith idt = 4*i; 383f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 384f1af5d2fSBarry Smith while (nz--) { 385f1af5d2fSBarry Smith idx = 4*(*vi--); 386f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390f1af5d2fSBarry Smith v -= 16; 391f1af5d2fSBarry Smith } 392f1af5d2fSBarry Smith } 3931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 394dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 
395f1af5d2fSBarry Smith PetscFunctionReturn(0); 396f1af5d2fSBarry Smith } 397f1af5d2fSBarry Smith 3984a2ae208SSatish Balay #undef __FUNCT__ 3994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 4004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4018499736aSShri Abhyankar { 4028499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 4038499736aSShri Abhyankar PetscErrorCode ierr; 404b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 4058499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 406b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 407b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 408b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 4098499736aSShri Abhyankar 4108499736aSShri Abhyankar PetscFunctionBegin; 4118499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4128499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4138499736aSShri Abhyankar 4148499736aSShri Abhyankar /* forward solve the U^T */ 4158499736aSShri Abhyankar idx = 0; 4168499736aSShri Abhyankar for (i=0; i<n; i++) { 4178499736aSShri Abhyankar v = aa + bs2*diag[i]; 4188499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 4198499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 4208499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 4218499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 4228499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 4238499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 4248499736aSShri Abhyankar v -= bs2; 4258499736aSShri Abhyankar 4268499736aSShri Abhyankar vi = aj + diag[i] - 1; 4278499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 4288499736aSShri Abhyankar for (j=0; j>-nz; j--) { 4298499736aSShri Abhyankar oidx = bs*vi[j]; 4308499736aSShri Abhyankar x[oidx] -= 
v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4318499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4328499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4338499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4348499736aSShri Abhyankar v -= bs2; 4358499736aSShri Abhyankar } 4368499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4378499736aSShri Abhyankar idx += bs; 4388499736aSShri Abhyankar } 4398499736aSShri Abhyankar /* backward solve the L^T */ 4408499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 4418499736aSShri Abhyankar v = aa + bs2*ai[i]; 4428499736aSShri Abhyankar vi = aj + ai[i]; 4438499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4448499736aSShri Abhyankar idt = bs*i; 4458499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4468499736aSShri Abhyankar for (j=0; j<nz; j++) { 4478499736aSShri Abhyankar idx = bs*vi[j]; 4488499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4498499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4508499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4518499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4528499736aSShri Abhyankar v += bs2; 4538499736aSShri Abhyankar } 4548499736aSShri Abhyankar } 4558499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4568499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4578499736aSShri Abhyankar PetscFunctionReturn(0); 4588499736aSShri Abhyankar } 4598499736aSShri Abhyankar 4608499736aSShri Abhyankar #undef __FUNCT__ 46106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 46206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 463f1af5d2fSBarry Smith { 464f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 
465dfbe8321SBarry Smith PetscErrorCode ierr; 466b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 467b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 468b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 469b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 470f1af5d2fSBarry Smith 471f1af5d2fSBarry Smith PetscFunctionBegin; 472ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474f1af5d2fSBarry Smith 475f1af5d2fSBarry Smith /* forward solve the U^T */ 476f1af5d2fSBarry Smith idx = 0; 477f1af5d2fSBarry Smith for (i=0; i<n; i++) { 478f1af5d2fSBarry Smith 479f1af5d2fSBarry Smith v = aa + 25*diag[i]; 480f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 481ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 482f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 483f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 484f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 485f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 486f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 487f1af5d2fSBarry Smith v += 25; 488f1af5d2fSBarry Smith 489f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 490f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 491f1af5d2fSBarry Smith while (nz--) { 492f1af5d2fSBarry Smith oidx = 5*(*vi++); 493f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 494f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 495f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 496f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 497f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 
498f1af5d2fSBarry Smith v += 25; 499f1af5d2fSBarry Smith } 500f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 501f1af5d2fSBarry Smith idx += 5; 502f1af5d2fSBarry Smith } 503f1af5d2fSBarry Smith /* backward solve the L^T */ 504f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 505f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 506f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 507f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 508f1af5d2fSBarry Smith idt = 5*i; 509f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 510f1af5d2fSBarry Smith while (nz--) { 511f1af5d2fSBarry Smith idx = 5*(*vi--); 512f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 513f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 514f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 515f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 516f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 517f1af5d2fSBarry Smith v -= 25; 518f1af5d2fSBarry Smith } 519f1af5d2fSBarry Smith } 5201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 521dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 522f1af5d2fSBarry Smith PetscFunctionReturn(0); 523f1af5d2fSBarry Smith } 524f1af5d2fSBarry Smith 5254a2ae208SSatish Balay #undef __FUNCT__ 5264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5288499736aSShri Abhyankar { 5298499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 5308499736aSShri Abhyankar PetscErrorCode ierr; 531b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5328499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 533b3260449SShri 
Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 534b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 535b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 5368499736aSShri Abhyankar 5378499736aSShri Abhyankar PetscFunctionBegin; 5388499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5398499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5408499736aSShri Abhyankar 5418499736aSShri Abhyankar /* forward solve the U^T */ 5428499736aSShri Abhyankar idx = 0; 5438499736aSShri Abhyankar for (i=0; i<n; i++) { 5448499736aSShri Abhyankar v = aa + bs2*diag[i]; 5458499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5468499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5478499736aSShri Abhyankar x5 = x[4+idx]; 5488499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5498499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5508499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5518499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5528499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5538499736aSShri Abhyankar v -= bs2; 5548499736aSShri Abhyankar 5558499736aSShri Abhyankar vi = aj + diag[i] - 1; 5568499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5578499736aSShri Abhyankar for (j=0; j>-nz; j--) { 5588499736aSShri Abhyankar oidx = bs*vi[j]; 5598499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5608499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5618499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5628499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5638499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5648499736aSShri Abhyankar v -= bs2; 
5658499736aSShri Abhyankar } 5668499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5678499736aSShri Abhyankar idx += bs; 5688499736aSShri Abhyankar } 5698499736aSShri Abhyankar /* backward solve the L^T */ 5708499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 5718499736aSShri Abhyankar v = aa + bs2*ai[i]; 5728499736aSShri Abhyankar vi = aj + ai[i]; 5738499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 5748499736aSShri Abhyankar idt = bs*i; 5758499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 5768499736aSShri Abhyankar for (j=0; j<nz; j++) { 5778499736aSShri Abhyankar idx = bs*vi[j]; 5788499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5798499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5808499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5818499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5828499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5838499736aSShri Abhyankar v += bs2; 5848499736aSShri Abhyankar } 5858499736aSShri Abhyankar } 5868499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5878499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5888499736aSShri Abhyankar PetscFunctionReturn(0); 5898499736aSShri Abhyankar } 5908499736aSShri Abhyankar 5918499736aSShri Abhyankar #undef __FUNCT__ 59206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 59306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 594f1af5d2fSBarry Smith { 595f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 596dfbe8321SBarry Smith PetscErrorCode ierr; 597b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 598b3260449SShri Abhyankar 
PetscInt i,nz,idx,idt,oidx; 599b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 600b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 601f1af5d2fSBarry Smith 602f1af5d2fSBarry Smith PetscFunctionBegin; 603ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6041ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 605f1af5d2fSBarry Smith 606f1af5d2fSBarry Smith /* forward solve the U^T */ 607f1af5d2fSBarry Smith idx = 0; 608f1af5d2fSBarry Smith for (i=0; i<n; i++) { 609f1af5d2fSBarry Smith 610f1af5d2fSBarry Smith v = aa + 36*diag[i]; 611f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 612ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 613ef66eb69SBarry Smith x6 = x[5+idx]; 614f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 615f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 616f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 617f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 618f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 619f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 620f1af5d2fSBarry Smith v += 36; 621f1af5d2fSBarry Smith 622f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 623f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith oidx = 6*(*vi++); 626f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry 
Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v += 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 635f1af5d2fSBarry Smith x[5+idx] = s6; 636f1af5d2fSBarry Smith idx += 6; 637f1af5d2fSBarry Smith } 638f1af5d2fSBarry Smith /* backward solve the L^T */ 639f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 640f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 641f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 642f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 643f1af5d2fSBarry Smith idt = 6*i; 644f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 645f1af5d2fSBarry Smith s6 = x[5+idt]; 646f1af5d2fSBarry Smith while (nz--) { 647f1af5d2fSBarry Smith idx = 6*(*vi--); 648f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 649f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 650f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 651f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 652f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 653f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 654f1af5d2fSBarry Smith v -= 36; 655f1af5d2fSBarry Smith } 656f1af5d2fSBarry Smith } 6571ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 658dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 659f1af5d2fSBarry Smith PetscFunctionReturn(0); 660f1af5d2fSBarry Smith } 661f1af5d2fSBarry Smith 6624a2ae208SSatish Balay #undef __FUNCT__ 6634dd39f65SShri Abhyankar #define __FUNCT__ 
"MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 6658499736aSShri Abhyankar { 6668499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 6678499736aSShri Abhyankar PetscErrorCode ierr; 668b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6698499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 670b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 671b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 672b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 6738499736aSShri Abhyankar 6748499736aSShri Abhyankar PetscFunctionBegin; 6758499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6768499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 6778499736aSShri Abhyankar 6788499736aSShri Abhyankar /* forward solve the U^T */ 6798499736aSShri Abhyankar idx = 0; 6808499736aSShri Abhyankar for (i=0; i<n; i++) { 6818499736aSShri Abhyankar v = aa + bs2*diag[i]; 6828499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 6838499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 6848499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 6858499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 6868499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 6878499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 6888499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 6898499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 6908499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 6918499736aSShri Abhyankar v -= bs2; 6928499736aSShri Abhyankar 6938499736aSShri Abhyankar vi = aj + diag[i] - 1; 6948499736aSShri 
Abhyankar nz = diag[i] - diag[i+1] - 1; 6958499736aSShri Abhyankar for (j=0; j>-nz; j--) { 6968499736aSShri Abhyankar oidx = bs*vi[j]; 6978499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 6988499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 6998499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7008499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7018499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7028499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7038499736aSShri Abhyankar v -= bs2; 7048499736aSShri Abhyankar } 7058499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 7068499736aSShri Abhyankar x[5+idx] = s6; 7078499736aSShri Abhyankar idx += bs; 7088499736aSShri Abhyankar } 7098499736aSShri Abhyankar /* backward solve the L^T */ 7108499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 7118499736aSShri Abhyankar v = aa + bs2*ai[i]; 7128499736aSShri Abhyankar vi = aj + ai[i]; 7138499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 7148499736aSShri Abhyankar idt = bs*i; 7158499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 7168499736aSShri Abhyankar s6 = x[5+idt]; 7178499736aSShri Abhyankar for (j=0; j<nz; j++) { 7188499736aSShri Abhyankar idx = bs*vi[j]; 7198499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7208499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7218499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7228499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7238499736aSShri Abhyankar x[idx+4] -= 
v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7248499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7258499736aSShri Abhyankar v += bs2; 7268499736aSShri Abhyankar } 7278499736aSShri Abhyankar } 7288499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7298499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7308499736aSShri Abhyankar PetscFunctionReturn(0); 7318499736aSShri Abhyankar } 7328499736aSShri Abhyankar 7338499736aSShri Abhyankar #undef __FUNCT__ 73406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 73506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 736f1af5d2fSBarry Smith { 737f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 738dfbe8321SBarry Smith PetscErrorCode ierr; 739b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 740b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 741b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 742b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 743f1af5d2fSBarry Smith 744f1af5d2fSBarry Smith PetscFunctionBegin; 745ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7461ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 747f1af5d2fSBarry Smith 748f1af5d2fSBarry Smith /* forward solve the U^T */ 749f1af5d2fSBarry Smith idx = 0; 750f1af5d2fSBarry Smith for (i=0; i<n; i++) { 751f1af5d2fSBarry Smith 752f1af5d2fSBarry Smith v = aa + 49*diag[i]; 753f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 754ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 755ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 756f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 757f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + 
v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 758f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 759f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 760f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 761f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 762f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 763f1af5d2fSBarry Smith v += 49; 764f1af5d2fSBarry Smith 765f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 766f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 767f1af5d2fSBarry Smith while (nz--) { 768f1af5d2fSBarry Smith oidx = 7*(*vi++); 769f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 770f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 771f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 772f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 773f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 774f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 775f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 776f1af5d2fSBarry Smith v += 49; 777f1af5d2fSBarry Smith } 778f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 779f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 780f1af5d2fSBarry Smith idx += 7; 781f1af5d2fSBarry Smith } 782f1af5d2fSBarry Smith /* backward solve the L^T */ 783f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 784f1af5d2fSBarry Smith v = aa + 
49*diag[i] - 49; 785f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 786f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 787f1af5d2fSBarry Smith idt = 7*i; 788f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 789f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 790f1af5d2fSBarry Smith while (nz--) { 791f1af5d2fSBarry Smith idx = 7*(*vi--); 792f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 793f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 794f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 795f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 796f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 797f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 798f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 799f1af5d2fSBarry Smith v -= 49; 800f1af5d2fSBarry Smith } 801f1af5d2fSBarry Smith } 8021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 803dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 804f1af5d2fSBarry Smith PetscFunctionReturn(0); 805f1af5d2fSBarry Smith } 8068499736aSShri Abhyankar #undef __FUNCT__ 8074dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 8084dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 8098499736aSShri Abhyankar { 8108499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 8118499736aSShri Abhyankar PetscErrorCode ierr; 812b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 8138499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 
814b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 815b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 816b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 8178499736aSShri Abhyankar 8188499736aSShri Abhyankar PetscFunctionBegin; 8198499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 8208499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8218499736aSShri Abhyankar 8228499736aSShri Abhyankar /* forward solve the U^T */ 8238499736aSShri Abhyankar idx = 0; 8248499736aSShri Abhyankar for (i=0; i<n; i++) { 8258499736aSShri Abhyankar v = aa + bs2*diag[i]; 8268499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8278499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8288499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8298499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8308499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8318499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8328499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8338499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8348499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8358499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8368499736aSShri Abhyankar v -= bs2; 8378499736aSShri Abhyankar vi = aj + diag[i] - 1; 8388499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8398499736aSShri Abhyankar for (j=0; j>-nz; j--) { 8408499736aSShri Abhyankar oidx = bs*vi[j]; 8418499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8428499736aSShri Abhyankar x[oidx+1] 
-= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8438499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8448499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8458499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8468499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8478499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8488499736aSShri Abhyankar v -= bs2; 8498499736aSShri Abhyankar } 8508499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8518499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8528499736aSShri Abhyankar idx += bs; 8538499736aSShri Abhyankar } 8548499736aSShri Abhyankar /* backward solve the L^T */ 8558499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 8568499736aSShri Abhyankar v = aa + bs2*ai[i]; 8578499736aSShri Abhyankar vi = aj + ai[i]; 8588499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8598499736aSShri Abhyankar idt = bs*i; 8608499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8618499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 8628499736aSShri Abhyankar for (j=0; j<nz; j++) { 8638499736aSShri Abhyankar idx = bs*vi[j]; 8648499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8658499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8668499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8678499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8688499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + 
v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8698499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8708499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8718499736aSShri Abhyankar v += bs2; 8728499736aSShri Abhyankar } 8738499736aSShri Abhyankar } 8748499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 8758499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 8768499736aSShri Abhyankar PetscFunctionReturn(0); 8778499736aSShri Abhyankar } 878f1af5d2fSBarry Smith 879f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 8804a2ae208SSatish Balay #undef __FUNCT__ 88193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 88293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 88393fd935bSShri Abhyankar { 88493fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 88593fd935bSShri Abhyankar IS iscol = a->col,isrow = a->row; 88693fd935bSShri Abhyankar PetscErrorCode ierr; 88793fd935bSShri Abhyankar const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 88893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 88993fd935bSShri Abhyankar PetscInt nz; 89093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 89193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 89293fd935bSShri Abhyankar const PetscScalar *b; 89393fd935bSShri Abhyankar 89493fd935bSShri Abhyankar PetscFunctionBegin; 8953649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 89693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 89793fd935bSShri Abhyankar tmp = a->solve_work; 89893fd935bSShri Abhyankar 89993fd935bSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 90093fd935bSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 
90193fd935bSShri Abhyankar 90293fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 90393fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[c[i]]; 90493fd935bSShri Abhyankar 90593fd935bSShri Abhyankar /* forward solve the U^T */ 90693fd935bSShri Abhyankar for (i=0; i<n; i++) { 90793fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 90893fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 90993fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 91093fd935bSShri Abhyankar s1 = tmp[i]; 91193fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 91293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 91393fd935bSShri Abhyankar tmp[i] = s1; 91493fd935bSShri Abhyankar } 91593fd935bSShri Abhyankar 91693fd935bSShri Abhyankar /* backward solve the L^T */ 91793fd935bSShri Abhyankar for (i=n-1; i>=0; i--) { 91893fd935bSShri Abhyankar v = aa + ai[i]; 91993fd935bSShri Abhyankar vi = aj + ai[i]; 92093fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 92193fd935bSShri Abhyankar s1 = tmp[i]; 92293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 92393fd935bSShri Abhyankar } 92493fd935bSShri Abhyankar 92593fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 92693fd935bSShri Abhyankar for (i=0; i<n; i++) x[r[i]] = tmp[i]; 92793fd935bSShri Abhyankar 92893fd935bSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 92993fd935bSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9303649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 93193fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 93293fd935bSShri Abhyankar 93393fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 93493fd935bSShri Abhyankar PetscFunctionReturn(0); 93593fd935bSShri Abhyankar } 93693fd935bSShri Abhyankar 93793fd935bSShri Abhyankar #undef __FUNCT__ 93806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 
93906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 940f1af5d2fSBarry Smith { 941f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 942f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9436849ba73SBarry Smith PetscErrorCode ierr; 9445d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 945b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 946b3260449SShri Abhyankar PetscInt i,nz; 947b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 948b3260449SShri Abhyankar PetscScalar s1,*x,*t; 949b3260449SShri Abhyankar const PetscScalar *b; 950f1af5d2fSBarry Smith 951f1af5d2fSBarry Smith PetscFunctionBegin; 9523649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 9531ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 954f1af5d2fSBarry Smith t = a->solve_work; 955f1af5d2fSBarry Smith 956f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 957f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 958f1af5d2fSBarry Smith 959f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 960*26fbe8dcSKarl Rupp for (i=0; i<n; i++) t[i] = b[c[i]]; 961f1af5d2fSBarry Smith 962f1af5d2fSBarry Smith /* forward solve the U^T */ 963f1af5d2fSBarry Smith for (i=0; i<n; i++) { 964f1af5d2fSBarry Smith 965f1af5d2fSBarry Smith v = aa + diag[i]; 966f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 967f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 968f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 969f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 970f1af5d2fSBarry Smith while (nz--) { 971f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 972f1af5d2fSBarry Smith } 973f1af5d2fSBarry Smith t[i] = s1; 974f1af5d2fSBarry Smith } 975f1af5d2fSBarry Smith /* backward solve the L^T */ 976f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 977f1af5d2fSBarry Smith v = aa + diag[i] - 1; 978f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 
979f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 980f1af5d2fSBarry Smith s1 = t[i]; 981f1af5d2fSBarry Smith while (nz--) { 982f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 983f1af5d2fSBarry Smith } 984f1af5d2fSBarry Smith } 985f1af5d2fSBarry Smith 986f1af5d2fSBarry Smith /* copy t into x according to permutation */ 987*26fbe8dcSKarl Rupp for (i=0; i<n; i++) x[r[i]] = t[i]; 988f1af5d2fSBarry Smith 989f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 990f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9913649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 9921ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 993dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 994f1af5d2fSBarry Smith PetscFunctionReturn(0); 995f1af5d2fSBarry Smith } 996f1af5d2fSBarry Smith 9974a2ae208SSatish Balay #undef __FUNCT__ 99806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 99906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1000f1af5d2fSBarry Smith { 1001f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1002f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10036849ba73SBarry Smith PetscErrorCode ierr; 10045d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1005b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1006b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1007b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1008b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1009b3260449SShri Abhyankar const PetscScalar *b; 1010f1af5d2fSBarry Smith 1011f1af5d2fSBarry Smith PetscFunctionBegin; 10123649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 10131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1014f1af5d2fSBarry Smith t = a->solve_work; 1015f1af5d2fSBarry Smith 1016f1af5d2fSBarry Smith ierr = 
ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1017f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1018f1af5d2fSBarry Smith 1019f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1020f1af5d2fSBarry Smith ii = 0; 1021f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1022f1af5d2fSBarry Smith ic = 2*c[i]; 1023f1af5d2fSBarry Smith t[ii] = b[ic]; 1024f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1025f1af5d2fSBarry Smith ii += 2; 1026f1af5d2fSBarry Smith } 1027f1af5d2fSBarry Smith 1028f1af5d2fSBarry Smith /* forward solve the U^T */ 1029f1af5d2fSBarry Smith idx = 0; 1030f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1031f1af5d2fSBarry Smith 1032f1af5d2fSBarry Smith v = aa + 4*diag[i]; 1033f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1034f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1035f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 1036f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 1037f1af5d2fSBarry Smith v += 4; 1038f1af5d2fSBarry Smith 1039f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1040f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1041f1af5d2fSBarry Smith while (nz--) { 1042f1af5d2fSBarry Smith oidx = 2*(*vi++); 1043f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 1044f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 1045f1af5d2fSBarry Smith v += 4; 1046f1af5d2fSBarry Smith } 1047f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1048f1af5d2fSBarry Smith idx += 2; 1049f1af5d2fSBarry Smith } 1050f1af5d2fSBarry Smith /* backward solve the L^T */ 1051f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1052f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 1053f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1054f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1055f1af5d2fSBarry Smith idt = 2*i; 1056f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1057f1af5d2fSBarry Smith while (nz--) { 1058f1af5d2fSBarry Smith idx = 2*(*vi--); 1059f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 1060f1af5d2fSBarry Smith 
t[idx+1] -= v[2]*s1 + v[3]*s2; 1061f1af5d2fSBarry Smith v -= 4; 1062f1af5d2fSBarry Smith } 1063f1af5d2fSBarry Smith } 1064f1af5d2fSBarry Smith 1065f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1066f1af5d2fSBarry Smith ii = 0; 1067f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1068f1af5d2fSBarry Smith ir = 2*r[i]; 1069f1af5d2fSBarry Smith x[ir] = t[ii]; 1070f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1071f1af5d2fSBarry Smith ii += 2; 1072f1af5d2fSBarry Smith } 1073f1af5d2fSBarry Smith 1074f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1075f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10763649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 10771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1078dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1079f1af5d2fSBarry Smith PetscFunctionReturn(0); 1080f1af5d2fSBarry Smith } 1081f1af5d2fSBarry Smith 10824a2ae208SSatish Balay #undef __FUNCT__ 10834dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 10844dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 108532121132SShri Abhyankar { 108632121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 108732121132SShri Abhyankar PetscErrorCode ierr; 108832121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1089b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 109032121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 109132121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1092b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1093b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1094b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1095b3260449SShri Abhyankar const PetscScalar *b; 109632121132SShri Abhyankar 109732121132SShri Abhyankar PetscFunctionBegin; 10983649974fSBarry Smith ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 109932121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 110032121132SShri Abhyankar t = a->solve_work; 110132121132SShri Abhyankar 110232121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 110332121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 110432121132SShri Abhyankar 110532121132SShri Abhyankar /* copy b into temp work space according to permutation */ 110632121132SShri Abhyankar for (i=0; i<n; i++) { 110732121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 110832121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 110932121132SShri Abhyankar } 111032121132SShri Abhyankar 111132121132SShri Abhyankar /* forward solve the U^T */ 111232121132SShri Abhyankar idx = 0; 111332121132SShri Abhyankar for (i=0; i<n; i++) { 111432121132SShri Abhyankar v = aa + bs2*diag[i]; 111532121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 111632121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 111732121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 111832121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 111932121132SShri Abhyankar v -= bs2; 112032121132SShri Abhyankar 112132121132SShri Abhyankar vi = aj + diag[i] - 1; 112232121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 112332121132SShri Abhyankar for (j=0; j>-nz; j--) { 112432121132SShri Abhyankar oidx = bs*vi[j]; 112532121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 112632121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 112732121132SShri Abhyankar v -= bs2; 112832121132SShri Abhyankar } 112932121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 113032121132SShri Abhyankar idx += bs; 113132121132SShri Abhyankar } 113232121132SShri Abhyankar /* backward solve the L^T */ 113332121132SShri Abhyankar for (i=n-1; i>=0; i--) { 113432121132SShri Abhyankar v = aa + bs2*ai[i]; 113532121132SShri Abhyankar vi = aj + ai[i]; 113632121132SShri Abhyankar nz = ai[i+1] - ai[i]; 113732121132SShri Abhyankar idt = bs*i; 
113832121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 113932121132SShri Abhyankar for (j=0; j<nz; j++) { 114032121132SShri Abhyankar idx = bs*vi[j]; 114132121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 114232121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 114332121132SShri Abhyankar v += bs2; 114432121132SShri Abhyankar } 114532121132SShri Abhyankar } 114632121132SShri Abhyankar 114732121132SShri Abhyankar /* copy t into x according to permutation */ 114832121132SShri Abhyankar for (i=0; i<n; i++) { 114932121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 115032121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 115132121132SShri Abhyankar } 115232121132SShri Abhyankar 115332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 115432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11553649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 115632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 115732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 115832121132SShri Abhyankar PetscFunctionReturn(0); 115932121132SShri Abhyankar } 116032121132SShri Abhyankar 116132121132SShri Abhyankar #undef __FUNCT__ 116206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 116306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1164f1af5d2fSBarry Smith { 1165f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1166f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11676849ba73SBarry Smith PetscErrorCode ierr; 11685d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1169b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1170b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1171b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1172b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1173b3260449SShri Abhyankar const PetscScalar 
*b; 1174f1af5d2fSBarry Smith 1175f1af5d2fSBarry Smith PetscFunctionBegin; 11763649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 11771ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1178f1af5d2fSBarry Smith t = a->solve_work; 1179f1af5d2fSBarry Smith 1180f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1181f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1182f1af5d2fSBarry Smith 1183f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1184f1af5d2fSBarry Smith ii = 0; 1185f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1186f1af5d2fSBarry Smith ic = 3*c[i]; 1187f1af5d2fSBarry Smith t[ii] = b[ic]; 1188f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1189f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1190f1af5d2fSBarry Smith ii += 3; 1191f1af5d2fSBarry Smith } 1192f1af5d2fSBarry Smith 1193f1af5d2fSBarry Smith /* forward solve the U^T */ 1194f1af5d2fSBarry Smith idx = 0; 1195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1196f1af5d2fSBarry Smith 1197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1199f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1201f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1203f1af5d2fSBarry Smith v += 9; 1204f1af5d2fSBarry Smith 1205f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1207f1af5d2fSBarry Smith while (nz--) { 1208f1af5d2fSBarry Smith oidx = 3*(*vi++); 1209f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1210f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1211f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1212f1af5d2fSBarry Smith v += 9; 1213f1af5d2fSBarry Smith } 1214f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 
1215f1af5d2fSBarry Smith idx += 3; 1216f1af5d2fSBarry Smith } 1217f1af5d2fSBarry Smith /* backward solve the L^T */ 1218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1222f1af5d2fSBarry Smith idt = 3*i; 1223f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1224f1af5d2fSBarry Smith while (nz--) { 1225f1af5d2fSBarry Smith idx = 3*(*vi--); 1226f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1227f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1228f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1229f1af5d2fSBarry Smith v -= 9; 1230f1af5d2fSBarry Smith } 1231f1af5d2fSBarry Smith } 1232f1af5d2fSBarry Smith 1233f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1234f1af5d2fSBarry Smith ii = 0; 1235f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1236f1af5d2fSBarry Smith ir = 3*r[i]; 1237f1af5d2fSBarry Smith x[ir] = t[ii]; 1238f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1239f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1240f1af5d2fSBarry Smith ii += 3; 1241f1af5d2fSBarry Smith } 1242f1af5d2fSBarry Smith 1243f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1244f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12453649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 12461ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1247dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1248f1af5d2fSBarry Smith PetscFunctionReturn(0); 1249f1af5d2fSBarry Smith } 1250f1af5d2fSBarry Smith 12514a2ae208SSatish Balay #undef __FUNCT__ 12524dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 12534dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 125432121132SShri Abhyankar { 125532121132SShri Abhyankar Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ*)A->data; 125632121132SShri Abhyankar PetscErrorCode ierr; 125732121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1258b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 125932121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 126032121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1261b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1262b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1263b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1264b3260449SShri Abhyankar const PetscScalar *b; 126532121132SShri Abhyankar 126632121132SShri Abhyankar PetscFunctionBegin; 12673649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 126832121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126932121132SShri Abhyankar t = a->solve_work; 127032121132SShri Abhyankar 127132121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 127232121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 127332121132SShri Abhyankar 127432121132SShri Abhyankar /* copy b into temp work space according to permutation */ 127532121132SShri Abhyankar for (i=0; i<n; i++) { 127632121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 127732121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 127832121132SShri Abhyankar } 127932121132SShri Abhyankar 128032121132SShri Abhyankar /* forward solve the U^T */ 128132121132SShri Abhyankar idx = 0; 128232121132SShri Abhyankar for (i=0; i<n; i++) { 128332121132SShri Abhyankar v = aa + bs2*diag[i]; 128432121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 128532121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 128632121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 128732121132SShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 128832121132SShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 128932121132SShri Abhyankar v -= bs2; 129032121132SShri 
Abhyankar 129132121132SShri Abhyankar vi = aj + diag[i] - 1; 129232121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 129332121132SShri Abhyankar for (j=0; j>-nz; j--) { 129432121132SShri Abhyankar oidx = bs*vi[j]; 129532121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 129632121132SShri Abhyankar t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 129732121132SShri Abhyankar t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 129832121132SShri Abhyankar v -= bs2; 129932121132SShri Abhyankar } 130032121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 130132121132SShri Abhyankar idx += bs; 130232121132SShri Abhyankar } 130332121132SShri Abhyankar /* backward solve the L^T */ 130432121132SShri Abhyankar for (i=n-1; i>=0; i--) { 130532121132SShri Abhyankar v = aa + bs2*ai[i]; 130632121132SShri Abhyankar vi = aj + ai[i]; 130732121132SShri Abhyankar nz = ai[i+1] - ai[i]; 130832121132SShri Abhyankar idt = bs*i; 130932121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 131032121132SShri Abhyankar for (j=0; j<nz; j++) { 131132121132SShri Abhyankar idx = bs*vi[j]; 131232121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 131332121132SShri Abhyankar t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 131432121132SShri Abhyankar t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 131532121132SShri Abhyankar v += bs2; 131632121132SShri Abhyankar } 131732121132SShri Abhyankar } 131832121132SShri Abhyankar 131932121132SShri Abhyankar /* copy t into x according to permutation */ 132032121132SShri Abhyankar for (i=0; i<n; i++) { 132132121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 132232121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 132332121132SShri Abhyankar } 132432121132SShri Abhyankar 132532121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 132632121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13273649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 132832121132SShri Abhyankar 
ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 132932121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 133032121132SShri Abhyankar PetscFunctionReturn(0); 133132121132SShri Abhyankar } 133232121132SShri Abhyankar 133332121132SShri Abhyankar #undef __FUNCT__ 133406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 133506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1336f1af5d2fSBarry Smith { 1337f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1338f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 13396849ba73SBarry Smith PetscErrorCode ierr; 13405d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1341b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1342b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1343b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1344b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1345b3260449SShri Abhyankar const PetscScalar *b; 1346f1af5d2fSBarry Smith 1347f1af5d2fSBarry Smith PetscFunctionBegin; 13483649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 13491ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1350f1af5d2fSBarry Smith t = a->solve_work; 1351f1af5d2fSBarry Smith 1352f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1353f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1354f1af5d2fSBarry Smith 1355f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1356f1af5d2fSBarry Smith ii = 0; 1357f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1358f1af5d2fSBarry Smith ic = 4*c[i]; 1359f1af5d2fSBarry Smith t[ii] = b[ic]; 1360f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1361f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1362f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1363f1af5d2fSBarry Smith ii += 4; 1364f1af5d2fSBarry Smith } 1365f1af5d2fSBarry Smith 
1366f1af5d2fSBarry Smith /* forward solve the U^T */ 1367f1af5d2fSBarry Smith idx = 0; 1368f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1369f1af5d2fSBarry Smith 1370f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1371f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1372f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1373f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1374f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1375f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1376f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1377f1af5d2fSBarry Smith v += 16; 1378f1af5d2fSBarry Smith 1379f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1380f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1381f1af5d2fSBarry Smith while (nz--) { 1382f1af5d2fSBarry Smith oidx = 4*(*vi++); 1383f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1384f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1385f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1386f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1387f1af5d2fSBarry Smith v += 16; 1388f1af5d2fSBarry Smith } 1389f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1390f1af5d2fSBarry Smith idx += 4; 1391f1af5d2fSBarry Smith } 1392f1af5d2fSBarry Smith /* backward solve the L^T */ 1393f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1394f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1395f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1396f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1397f1af5d2fSBarry Smith idt = 4*i; 1398f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1399f1af5d2fSBarry Smith while (nz--) { 1400f1af5d2fSBarry Smith idx = 4*(*vi--); 1401f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1402f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + 
v[7]*s4; 1403f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1404f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1405f1af5d2fSBarry Smith v -= 16; 1406f1af5d2fSBarry Smith } 1407f1af5d2fSBarry Smith } 1408f1af5d2fSBarry Smith 1409f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1410f1af5d2fSBarry Smith ii = 0; 1411f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1412f1af5d2fSBarry Smith ir = 4*r[i]; 1413f1af5d2fSBarry Smith x[ir] = t[ii]; 1414f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1415f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1416f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1417f1af5d2fSBarry Smith ii += 4; 1418f1af5d2fSBarry Smith } 1419f1af5d2fSBarry Smith 1420f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1421f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14223649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 14231ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1424dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1425f1af5d2fSBarry Smith PetscFunctionReturn(0); 1426f1af5d2fSBarry Smith } 1427f1af5d2fSBarry Smith 14284a2ae208SSatish Balay #undef __FUNCT__ 14294dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 14304dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 143132121132SShri Abhyankar { 143232121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 143332121132SShri Abhyankar PetscErrorCode ierr; 143432121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1435b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 143632121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 143732121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1438b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1439b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1440b3260449SShri 
Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1441b3260449SShri Abhyankar const PetscScalar *b; 144232121132SShri Abhyankar 144332121132SShri Abhyankar PetscFunctionBegin; 14443649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 144532121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 144632121132SShri Abhyankar t = a->solve_work; 144732121132SShri Abhyankar 144832121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 144932121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 145032121132SShri Abhyankar 145132121132SShri Abhyankar /* copy b into temp work space according to permutation */ 145232121132SShri Abhyankar for (i=0; i<n; i++) { 145332121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 145432121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 145532121132SShri Abhyankar } 145632121132SShri Abhyankar 145732121132SShri Abhyankar /* forward solve the U^T */ 145832121132SShri Abhyankar idx = 0; 145932121132SShri Abhyankar for (i=0; i<n; i++) { 146032121132SShri Abhyankar v = aa + bs2*diag[i]; 146132121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 146232121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 146332121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 146432121132SShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 146532121132SShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 146632121132SShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 146732121132SShri Abhyankar v -= bs2; 146832121132SShri Abhyankar 146932121132SShri Abhyankar vi = aj + diag[i] - 1; 147032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 147132121132SShri Abhyankar for (j=0; j>-nz; j--) { 147232121132SShri Abhyankar oidx = bs*vi[j]; 147332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 147432121132SShri Abhyankar t[oidx+1] -= v[4]*s1 + 
v[5]*s2 + v[6]*s3 + v[7]*s4; 147532121132SShri Abhyankar t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 147632121132SShri Abhyankar t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 147732121132SShri Abhyankar v -= bs2; 147832121132SShri Abhyankar } 147932121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 148032121132SShri Abhyankar idx += bs; 148132121132SShri Abhyankar } 148232121132SShri Abhyankar /* backward solve the L^T */ 148332121132SShri Abhyankar for (i=n-1; i>=0; i--) { 148432121132SShri Abhyankar v = aa + bs2*ai[i]; 148532121132SShri Abhyankar vi = aj + ai[i]; 148632121132SShri Abhyankar nz = ai[i+1] - ai[i]; 148732121132SShri Abhyankar idt = bs*i; 148832121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 148932121132SShri Abhyankar for (j=0; j<nz; j++) { 149032121132SShri Abhyankar idx = bs*vi[j]; 149132121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 149232121132SShri Abhyankar t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 149332121132SShri Abhyankar t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 149432121132SShri Abhyankar t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 149532121132SShri Abhyankar v += bs2; 149632121132SShri Abhyankar } 149732121132SShri Abhyankar } 149832121132SShri Abhyankar 149932121132SShri Abhyankar /* copy t into x according to permutation */ 150032121132SShri Abhyankar for (i=0; i<n; i++) { 150132121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 150232121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 150332121132SShri Abhyankar } 150432121132SShri Abhyankar 150532121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 150632121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15073649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 150832121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 150932121132SShri 
Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 151032121132SShri Abhyankar PetscFunctionReturn(0); 151132121132SShri Abhyankar } 151232121132SShri Abhyankar 151332121132SShri Abhyankar #undef __FUNCT__ 151406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 151506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1516f1af5d2fSBarry Smith { 1517f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1518f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 15196849ba73SBarry Smith PetscErrorCode ierr; 15205d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1521b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1522b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1523b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1524b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1525b3260449SShri Abhyankar const PetscScalar *b; 1526f1af5d2fSBarry Smith 1527f1af5d2fSBarry Smith PetscFunctionBegin; 15283649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 15291ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1530f1af5d2fSBarry Smith t = a->solve_work; 1531f1af5d2fSBarry Smith 1532f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1533f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1534f1af5d2fSBarry Smith 1535f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1536f1af5d2fSBarry Smith ii = 0; 1537f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1538f1af5d2fSBarry Smith ic = 5*c[i]; 1539f1af5d2fSBarry Smith t[ii] = b[ic]; 1540f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1541f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1542f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1543f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1544f1af5d2fSBarry Smith ii += 5; 1545f1af5d2fSBarry Smith } 1546f1af5d2fSBarry Smith 1547f1af5d2fSBarry Smith /* 
forward solve the U^T */ 1548f1af5d2fSBarry Smith idx = 0; 1549f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1550f1af5d2fSBarry Smith 1551f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1552f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1553f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1554f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1555f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1556f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1557f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1558f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1559f1af5d2fSBarry Smith v += 25; 1560f1af5d2fSBarry Smith 1561f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1562f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1563f1af5d2fSBarry Smith while (nz--) { 1564f1af5d2fSBarry Smith oidx = 5*(*vi++); 1565f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1566f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1567f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1568f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1569f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1570f1af5d2fSBarry Smith v += 25; 1571f1af5d2fSBarry Smith } 1572f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1573f1af5d2fSBarry Smith idx += 5; 1574f1af5d2fSBarry Smith } 1575f1af5d2fSBarry Smith /* backward solve the L^T */ 1576f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1577f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 1578f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1579f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1580f1af5d2fSBarry Smith idt = 5*i; 1581f1af5d2fSBarry Smith s1 = t[idt]; s2 
= t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1582f1af5d2fSBarry Smith while (nz--) { 1583f1af5d2fSBarry Smith idx = 5*(*vi--); 1584f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1585f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1586f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1587f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1588f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1589f1af5d2fSBarry Smith v -= 25; 1590f1af5d2fSBarry Smith } 1591f1af5d2fSBarry Smith } 1592f1af5d2fSBarry Smith 1593f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1594f1af5d2fSBarry Smith ii = 0; 1595f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1596f1af5d2fSBarry Smith ir = 5*r[i]; 1597f1af5d2fSBarry Smith x[ir] = t[ii]; 1598f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1599f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1600f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1601f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1602f1af5d2fSBarry Smith ii += 5; 1603f1af5d2fSBarry Smith } 1604f1af5d2fSBarry Smith 1605f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1606f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16073649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 16081ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1609dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1610f1af5d2fSBarry Smith PetscFunctionReturn(0); 1611f1af5d2fSBarry Smith } 1612f1af5d2fSBarry Smith 16134a2ae208SSatish Balay #undef __FUNCT__ 16144dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 16154dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 161632121132SShri Abhyankar { 161732121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 
161832121132SShri Abhyankar PetscErrorCode ierr; 161932121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1620b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 162132121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 162232121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1623b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1624b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1625b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1626b3260449SShri Abhyankar const PetscScalar *b; 162732121132SShri Abhyankar 162832121132SShri Abhyankar PetscFunctionBegin; 16293649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 163032121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 163132121132SShri Abhyankar t = a->solve_work; 163232121132SShri Abhyankar 163332121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 163432121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 163532121132SShri Abhyankar 163632121132SShri Abhyankar /* copy b into temp work space according to permutation */ 163732121132SShri Abhyankar for (i=0; i<n; i++) { 163832121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 163932121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 164032121132SShri Abhyankar t[ii+4] = b[ic+4]; 164132121132SShri Abhyankar } 164232121132SShri Abhyankar 164332121132SShri Abhyankar /* forward solve the U^T */ 164432121132SShri Abhyankar idx = 0; 164532121132SShri Abhyankar for (i=0; i<n; i++) { 164632121132SShri Abhyankar v = aa + bs2*diag[i]; 164732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 164832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 164932121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 165032121132SShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + 
v[9]*x5; 165132121132SShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 165232121132SShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 165332121132SShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 165432121132SShri Abhyankar v -= bs2; 165532121132SShri Abhyankar 165632121132SShri Abhyankar vi = aj + diag[i] - 1; 165732121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 165832121132SShri Abhyankar for (j=0; j>-nz; j--) { 165932121132SShri Abhyankar oidx = bs*vi[j]; 166032121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 166132121132SShri Abhyankar t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 166232121132SShri Abhyankar t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 166332121132SShri Abhyankar t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 166432121132SShri Abhyankar t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 166532121132SShri Abhyankar v -= bs2; 166632121132SShri Abhyankar } 166732121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 166832121132SShri Abhyankar idx += bs; 166932121132SShri Abhyankar } 167032121132SShri Abhyankar /* backward solve the L^T */ 167132121132SShri Abhyankar for (i=n-1; i>=0; i--) { 167232121132SShri Abhyankar v = aa + bs2*ai[i]; 167332121132SShri Abhyankar vi = aj + ai[i]; 167432121132SShri Abhyankar nz = ai[i+1] - ai[i]; 167532121132SShri Abhyankar idt = bs*i; 167632121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 167732121132SShri Abhyankar for (j=0; j<nz; j++) { 167832121132SShri Abhyankar idx = bs*vi[j]; 167932121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 168032121132SShri Abhyankar t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 168132121132SShri Abhyankar t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 
168232121132SShri Abhyankar t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 168332121132SShri Abhyankar t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 168432121132SShri Abhyankar v += bs2; 168532121132SShri Abhyankar } 168632121132SShri Abhyankar } 168732121132SShri Abhyankar 168832121132SShri Abhyankar /* copy t into x according to permutation */ 168932121132SShri Abhyankar for (i=0; i<n; i++) { 169032121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 169132121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 169232121132SShri Abhyankar x[ir+4] = t[ii+4]; 169332121132SShri Abhyankar } 169432121132SShri Abhyankar 169532121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 169632121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16973649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 169832121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169932121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 170032121132SShri Abhyankar PetscFunctionReturn(0); 170132121132SShri Abhyankar } 170232121132SShri Abhyankar 170332121132SShri Abhyankar #undef __FUNCT__ 170406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 170506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1706f1af5d2fSBarry Smith { 1707f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1708f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 17096849ba73SBarry Smith PetscErrorCode ierr; 17105d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1711b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1712b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1713b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1714b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1715b3260449SShri 
Abhyankar const PetscScalar *b; 1716f1af5d2fSBarry Smith 1717f1af5d2fSBarry Smith PetscFunctionBegin; 17183649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 17191ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1720f1af5d2fSBarry Smith t = a->solve_work; 1721f1af5d2fSBarry Smith 1722f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1723f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1724f1af5d2fSBarry Smith 1725f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1726f1af5d2fSBarry Smith ii = 0; 1727f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1728f1af5d2fSBarry Smith ic = 6*c[i]; 1729f1af5d2fSBarry Smith t[ii] = b[ic]; 1730f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1731f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1732f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1733f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1734f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1735f1af5d2fSBarry Smith ii += 6; 1736f1af5d2fSBarry Smith } 1737f1af5d2fSBarry Smith 1738f1af5d2fSBarry Smith /* forward solve the U^T */ 1739f1af5d2fSBarry Smith idx = 0; 1740f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1741f1af5d2fSBarry Smith 1742f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1743f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1744f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1745f1af5d2fSBarry Smith x6 = t[5+idx]; 1746f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1747f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1748f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1749f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1750f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1751f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + 
v[33]*x4 + v[34]*x5 + v[35]*x6; 1752f1af5d2fSBarry Smith v += 36; 1753f1af5d2fSBarry Smith 1754f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1755f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1756f1af5d2fSBarry Smith while (nz--) { 1757f1af5d2fSBarry Smith oidx = 6*(*vi++); 1758f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1759f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1760f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1761f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1762f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1763f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1764f1af5d2fSBarry Smith v += 36; 1765f1af5d2fSBarry Smith } 1766f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1767f1af5d2fSBarry Smith t[5+idx] = s6; 1768f1af5d2fSBarry Smith idx += 6; 1769f1af5d2fSBarry Smith } 1770f1af5d2fSBarry Smith /* backward solve the L^T */ 1771f1af5d2fSBarry Smith for (i=n-1; i>=0; i--) { 1772f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1773f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1774f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1775f1af5d2fSBarry Smith idt = 6*i; 1776f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1777f1af5d2fSBarry Smith s6 = t[5+idt]; 1778f1af5d2fSBarry Smith while (nz--) { 1779f1af5d2fSBarry Smith idx = 6*(*vi--); 1780f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1781f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1782f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1783f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + 
v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1784f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1785f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1786f1af5d2fSBarry Smith v -= 36; 1787f1af5d2fSBarry Smith } 1788f1af5d2fSBarry Smith } 1789f1af5d2fSBarry Smith 1790f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1791f1af5d2fSBarry Smith ii = 0; 1792f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1793f1af5d2fSBarry Smith ir = 6*r[i]; 1794f1af5d2fSBarry Smith x[ir] = t[ii]; 1795f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1796f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1797f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1798f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1799f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1800f1af5d2fSBarry Smith ii += 6; 1801f1af5d2fSBarry Smith } 1802f1af5d2fSBarry Smith 1803f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1804f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18053649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 18061ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1807dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1808f1af5d2fSBarry Smith PetscFunctionReturn(0); 1809f1af5d2fSBarry Smith } 1810f1af5d2fSBarry Smith 18114a2ae208SSatish Balay #undef __FUNCT__ 18124dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 18134dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 181432121132SShri Abhyankar { 181532121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 181632121132SShri Abhyankar PetscErrorCode ierr; 181732121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1818b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 181932121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 182032121132SShri Abhyankar PetscInt 
nz,idx,idt,j,i,oidx,ii,ic,ir; 1821b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 1822b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1823b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1824b3260449SShri Abhyankar const PetscScalar *b; 182532121132SShri Abhyankar 182632121132SShri Abhyankar PetscFunctionBegin; 18273649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 182832121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 182932121132SShri Abhyankar t = a->solve_work; 183032121132SShri Abhyankar 183132121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 183232121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 183332121132SShri Abhyankar 183432121132SShri Abhyankar /* copy b into temp work space according to permutation */ 183532121132SShri Abhyankar for (i=0; i<n; i++) { 183632121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 183732121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 183832121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 183932121132SShri Abhyankar } 184032121132SShri Abhyankar 184132121132SShri Abhyankar /* forward solve the U^T */ 184232121132SShri Abhyankar idx = 0; 184332121132SShri Abhyankar for (i=0; i<n; i++) { 184432121132SShri Abhyankar v = aa + bs2*diag[i]; 184532121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 184632121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 184732121132SShri Abhyankar x6 = t[5+idx]; 184832121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 184932121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 185032121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 185132121132SShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 
185232121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 185332121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 185432121132SShri Abhyankar v -= bs2; 185532121132SShri Abhyankar 185632121132SShri Abhyankar vi = aj + diag[i] - 1; 185732121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 185832121132SShri Abhyankar for (j=0; j>-nz; j--) { 185932121132SShri Abhyankar oidx = bs*vi[j]; 186032121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 186132121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 186232121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 186332121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 186432121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 186532121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 186632121132SShri Abhyankar v -= bs2; 186732121132SShri Abhyankar } 186832121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 186932121132SShri Abhyankar t[5+idx] = s6; 187032121132SShri Abhyankar idx += bs; 187132121132SShri Abhyankar } 187232121132SShri Abhyankar /* backward solve the L^T */ 187332121132SShri Abhyankar for (i=n-1; i>=0; i--) { 187432121132SShri Abhyankar v = aa + bs2*ai[i]; 187532121132SShri Abhyankar vi = aj + ai[i]; 187632121132SShri Abhyankar nz = ai[i+1] - ai[i]; 187732121132SShri Abhyankar idt = bs*i; 187832121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 187932121132SShri Abhyankar s6 = t[5+idt]; 188032121132SShri Abhyankar for (j=0; j<nz; j++) { 188132121132SShri Abhyankar idx = bs*vi[j]; 188232121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 
188332121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 188432121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 188532121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 188632121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 188732121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 188832121132SShri Abhyankar v += bs2; 188932121132SShri Abhyankar } 189032121132SShri Abhyankar } 189132121132SShri Abhyankar 189232121132SShri Abhyankar /* copy t into x according to permutation */ 189332121132SShri Abhyankar for (i=0; i<n; i++) { 189432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 189532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 189632121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 189732121132SShri Abhyankar } 189832121132SShri Abhyankar 189932121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 190032121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19013649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 190232121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 190332121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 190432121132SShri Abhyankar PetscFunctionReturn(0); 190532121132SShri Abhyankar } 190632121132SShri Abhyankar 190732121132SShri Abhyankar #undef __FUNCT__ 190806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 190906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1910f1af5d2fSBarry Smith { 1911f1af5d2fSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 1912f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 19136849ba73SBarry Smith PetscErrorCode ierr; 19145d0c19d7SBarry Smith 
const PetscInt *r,*c,*rout,*cout; 1915b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1916b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1917b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1918b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1919b3260449SShri Abhyankar const PetscScalar *b; 1920f1af5d2fSBarry Smith 1921f1af5d2fSBarry Smith PetscFunctionBegin; 19223649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 19231ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1924f1af5d2fSBarry Smith t = a->solve_work; 1925f1af5d2fSBarry Smith 1926f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1927f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1928f1af5d2fSBarry Smith 1929f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1930f1af5d2fSBarry Smith ii = 0; 1931f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1932f1af5d2fSBarry Smith ic = 7*c[i]; 1933f1af5d2fSBarry Smith t[ii] = b[ic]; 1934f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1935f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1936f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1937f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1938f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1939f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1940f1af5d2fSBarry Smith ii += 7; 1941f1af5d2fSBarry Smith } 1942f1af5d2fSBarry Smith 1943f1af5d2fSBarry Smith /* forward solve the U^T */ 1944f1af5d2fSBarry Smith idx = 0; 1945f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1946f1af5d2fSBarry Smith 1947f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1948f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1949f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1950f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1951f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1952f1af5d2fSBarry 
Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1953f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1954f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1955f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1956f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1957f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1958f1af5d2fSBarry Smith v += 49; 1959f1af5d2fSBarry Smith 1960f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1961f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1962f1af5d2fSBarry Smith while (nz--) { 1963f1af5d2fSBarry Smith oidx = 7*(*vi++); 1964f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1965f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1966f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1967f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1968f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1969f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1970f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1971f1af5d2fSBarry Smith v += 49; 1972f1af5d2fSBarry Smith } 1973f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1974f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1975f1af5d2fSBarry Smith idx += 7; 1976f1af5d2fSBarry Smith } 1977f1af5d2fSBarry Smith /* backward solve the L^T */ 1978f1af5d2fSBarry Smith for 
(i=n-1; i>=0; i--) { 1979f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1980f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1981f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1982f1af5d2fSBarry Smith idt = 7*i; 1983f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1984f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1985f1af5d2fSBarry Smith while (nz--) { 1986f1af5d2fSBarry Smith idx = 7*(*vi--); 1987f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1988f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1989f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1990f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1991f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1992f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1993f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1994f1af5d2fSBarry Smith v -= 49; 1995f1af5d2fSBarry Smith } 1996f1af5d2fSBarry Smith } 1997f1af5d2fSBarry Smith 1998f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1999f1af5d2fSBarry Smith ii = 0; 2000f1af5d2fSBarry Smith for (i=0; i<n; i++) { 2001f1af5d2fSBarry Smith ir = 7*r[i]; 2002f1af5d2fSBarry Smith x[ir] = t[ii]; 2003f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 2004f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 2005f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 2006f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 2007f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 2008f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 2009f1af5d2fSBarry Smith ii += 7; 2010f1af5d2fSBarry Smith } 2011f1af5d2fSBarry Smith 2012f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 
2013f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20143649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 20151ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2016dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2017f1af5d2fSBarry Smith PetscFunctionReturn(0); 2018f1af5d2fSBarry Smith } 201932121132SShri Abhyankar #undef __FUNCT__ 20204dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 20214dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 202232121132SShri Abhyankar { 202332121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 202432121132SShri Abhyankar PetscErrorCode ierr; 202532121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 2026b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 202732121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 202832121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2029b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 2030b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2031b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2032b3260449SShri Abhyankar const PetscScalar *b; 203332121132SShri Abhyankar 203432121132SShri Abhyankar PetscFunctionBegin; 20353649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 203632121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 203732121132SShri Abhyankar t = a->solve_work; 203832121132SShri Abhyankar 203932121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 204032121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 204132121132SShri Abhyankar 204232121132SShri Abhyankar /* copy b into temp work space according to permutation */ 204332121132SShri Abhyankar for (i=0; i<n; i++) { 204432121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 204532121132SShri 
Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 204632121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 204732121132SShri Abhyankar } 204832121132SShri Abhyankar 204932121132SShri Abhyankar /* forward solve the U^T */ 205032121132SShri Abhyankar idx = 0; 205132121132SShri Abhyankar for (i=0; i<n; i++) { 205232121132SShri Abhyankar v = aa + bs2*diag[i]; 205332121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 205432121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 205532121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 205632121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 205732121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 205832121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 205932121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 206032121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 206132121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 206232121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 206332121132SShri Abhyankar v -= bs2; 206432121132SShri Abhyankar 206532121132SShri Abhyankar vi = aj + diag[i] - 1; 206632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 206732121132SShri Abhyankar for (j=0; j>-nz; j--) { 206832121132SShri Abhyankar oidx = bs*vi[j]; 206932121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 207032121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 207132121132SShri Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + 
v[19]*s6 + v[20]*s7; 207232121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 207332121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 207432121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 207532121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 207632121132SShri Abhyankar v -= bs2; 207732121132SShri Abhyankar } 207832121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 207932121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 208032121132SShri Abhyankar idx += bs; 208132121132SShri Abhyankar } 208232121132SShri Abhyankar /* backward solve the L^T */ 208332121132SShri Abhyankar for (i=n-1; i>=0; i--) { 208432121132SShri Abhyankar v = aa + bs2*ai[i]; 208532121132SShri Abhyankar vi = aj + ai[i]; 208632121132SShri Abhyankar nz = ai[i+1] - ai[i]; 208732121132SShri Abhyankar idt = bs*i; 208832121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 208932121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 209032121132SShri Abhyankar for (j=0; j<nz; j++) { 209132121132SShri Abhyankar idx = bs*vi[j]; 209232121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 209332121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 209432121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 209532121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 209632121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 209732121132SShri Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + 
v[40]*s6 + v[41]*s7; 209832121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 209932121132SShri Abhyankar v += bs2; 210032121132SShri Abhyankar } 210132121132SShri Abhyankar } 210232121132SShri Abhyankar 210332121132SShri Abhyankar /* copy t into x according to permutation */ 210432121132SShri Abhyankar for (i=0; i<n; i++) { 210532121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 210632121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 210732121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 210832121132SShri Abhyankar } 210932121132SShri Abhyankar 211032121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 211132121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21123649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 211332121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 211432121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 211532121132SShri Abhyankar PetscFunctionReturn(0); 211632121132SShri Abhyankar } 2117f1af5d2fSBarry Smith 21184e2b4712SSatish Balay /* ----------------------------------------------------------- */ 21194a2ae208SSatish Balay #undef __FUNCT__ 212006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 212106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21224e2b4712SSatish Balay { 21234e2b4712SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 21244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21256849ba73SBarry Smith PetscErrorCode ierr; 2126b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2127b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2128b3260449SShri Abhyankar PetscInt i,nz; 2129b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 2130b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 
2131b3260449SShri Abhyankar PetscScalar *x,*s,*t,*ls; 2132b3260449SShri Abhyankar const PetscScalar *b; 21334e2b4712SSatish Balay 21344e2b4712SSatish Balay PetscFunctionBegin; 21353649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21361ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2137f1af5d2fSBarry Smith t = a->solve_work; 21384e2b4712SSatish Balay 21394e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21404e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21414e2b4712SSatish Balay 21424e2b4712SSatish Balay /* forward solve the lower triangular */ 214387828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21444e2b4712SSatish Balay for (i=1; i<n; i++) { 21454e2b4712SSatish Balay v = aa + bs2*ai[i]; 21464e2b4712SSatish Balay vi = aj + ai[i]; 21474e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2148f1af5d2fSBarry Smith s = t + bs*i; 214987828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21504e2b4712SSatish Balay while (nz--) { 215196b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 21524e2b4712SSatish Balay v += bs2; 21534e2b4712SSatish Balay } 21544e2b4712SSatish Balay } 21554e2b4712SSatish Balay /* backward solve the upper triangular */ 2156d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 21574e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 21584e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 21594e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 21604e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 216187828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21624e2b4712SSatish Balay while (nz--) { 216396b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 21644e2b4712SSatish Balay v += bs2; 21654e2b4712SSatish Balay } 216696b95a6bSBarry Smith 
PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 216787828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 21684e2b4712SSatish Balay } 21694e2b4712SSatish Balay 21704e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 21714e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21723649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 21731ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2174dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 21754e2b4712SSatish Balay PetscFunctionReturn(0); 21764e2b4712SSatish Balay } 21774e2b4712SSatish Balay 21785c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 21795c42ef9dSBarry Smith #undef __FUNCT__ 218006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 218106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21825c42ef9dSBarry Smith { 21835c42ef9dSBarry Smith Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 21845c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 21855c42ef9dSBarry Smith PetscErrorCode ierr; 21865c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2187b3260449SShri Abhyankar PetscInt i,nz,j; 2188b3260449SShri Abhyankar const PetscInt n =a->mbs,bs=A->rmap->bs,bs2=a->bs2; 21895c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 21905c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 21915c42ef9dSBarry Smith const PetscScalar *b; 21926e111a19SKarl Rupp 21935c42ef9dSBarry Smith PetscFunctionBegin; 21943649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21955c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 21965c42ef9dSBarry Smith t = a->solve_work; 21975c42ef9dSBarry Smith 21985c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21995c42ef9dSBarry Smith ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22005c42ef9dSBarry Smith 22015c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 22025c42ef9dSBarry Smith for (i=0; i<n; i++) { 22035c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22045c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 22055c42ef9dSBarry Smith } 22065c42ef9dSBarry Smith } 22075c42ef9dSBarry Smith 22085c42ef9dSBarry Smith 22095c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 22105c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 22115c42ef9dSBarry Smith for (i=0; i<n; i++) { 22125c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 221396b95a6bSBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 22145c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 22155c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 22165c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 22175c42ef9dSBarry Smith while (nz--) { 221896b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22195c42ef9dSBarry Smith v += bs2; 22205c42ef9dSBarry Smith } 22215c42ef9dSBarry Smith } 22225c42ef9dSBarry Smith 22235c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 22245c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 22255c42ef9dSBarry Smith v = aa + bs2*ai[i]; 22265c42ef9dSBarry Smith vi = aj + ai[i]; 22275c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 22285c42ef9dSBarry Smith while (nz--) { 222996b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22305c42ef9dSBarry Smith v += bs2; 22315c42ef9dSBarry Smith } 22325c42ef9dSBarry Smith } 22335c42ef9dSBarry Smith 22345c42ef9dSBarry Smith /* copy t into x according to permutation */ 22355c42ef9dSBarry Smith for (i=0; i<n; i++) { 22365c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22375c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 22385c42ef9dSBarry Smith } 22395c42ef9dSBarry Smith } 22405c42ef9dSBarry Smith 
22415c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22425c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22433649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22445c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22455c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22465c42ef9dSBarry Smith PetscFunctionReturn(0); 22475c42ef9dSBarry Smith } 22485c42ef9dSBarry Smith 22494a2ae208SSatish Balay #undef __FUNCT__ 22504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 22514dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 22528499736aSShri Abhyankar { 22538499736aSShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 22548499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 22558499736aSShri Abhyankar PetscErrorCode ierr; 2256b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2257b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2258b3260449SShri Abhyankar PetscInt i,j,nz; 2259b3260449SShri Abhyankar const PetscInt bs =A->rmap->bs,bs2=a->bs2; 22608499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 22618499736aSShri Abhyankar PetscScalar *x,*t,*ls; 22628499736aSShri Abhyankar const PetscScalar *b; 2263b3260449SShri Abhyankar 22648499736aSShri Abhyankar PetscFunctionBegin; 22653649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 22668499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22678499736aSShri Abhyankar t = a->solve_work; 22688499736aSShri Abhyankar 22698499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22708499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22718499736aSShri Abhyankar 22728499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 22738499736aSShri Abhyankar for (i=0; i<n; i++) { 22748499736aSShri 
Abhyankar for (j=0; j<bs; j++) { 22758499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 22768499736aSShri Abhyankar } 22778499736aSShri Abhyankar } 22788499736aSShri Abhyankar 22798499736aSShri Abhyankar 22808499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 22818499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 22828499736aSShri Abhyankar for (i=0; i<n; i++) { 22838499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 228496b95a6bSBarry Smith PetscKernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 22858499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 22868499736aSShri Abhyankar vi = aj + diag[i] - 1; 22878499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 22888499736aSShri Abhyankar for (j=0; j>-nz; j--) { 228996b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 22908499736aSShri Abhyankar v -= bs2; 22918499736aSShri Abhyankar } 22928499736aSShri Abhyankar } 22938499736aSShri Abhyankar 22948499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 22958499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 22968499736aSShri Abhyankar v = aa + bs2*ai[i]; 22978499736aSShri Abhyankar vi = aj + ai[i]; 22988499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 22998499736aSShri Abhyankar for (j=0; j<nz; j++) { 230096b95a6bSBarry Smith PetscKernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23018499736aSShri Abhyankar v += bs2; 23028499736aSShri Abhyankar } 23038499736aSShri Abhyankar } 23048499736aSShri Abhyankar 23058499736aSShri Abhyankar /* copy t into x according to permutation */ 23068499736aSShri Abhyankar for (i=0; i<n; i++) { 23078499736aSShri Abhyankar for (j=0; j<bs; j++) { 23088499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 23098499736aSShri Abhyankar } 23108499736aSShri Abhyankar } 23118499736aSShri Abhyankar 23128499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23138499736aSShri Abhyankar ierr = 
ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23143649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 23158499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23168499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 23178499736aSShri Abhyankar PetscFunctionReturn(0); 23188499736aSShri Abhyankar } 23198499736aSShri Abhyankar 2320832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 232129a97285SShri Abhyankar 23222b0b2ea7SShri Abhyankar #undef __FUNCT__ 2323832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2324832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 23252b0b2ea7SShri Abhyankar { 23262b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 23272b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2328b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 23290fa040f9SShri Abhyankar PetscInt i,nz,idx,idt,m; 23300b68f018SBarry Smith const MatScalar *aa=a->a,*v; 23312b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 23322b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 23330fa040f9SShri Abhyankar PetscScalar *x; 23340b68f018SBarry Smith const PetscScalar *b; 23352b0b2ea7SShri Abhyankar 23362b0b2ea7SShri Abhyankar PetscFunctionBegin; 23373649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23382b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23392b0b2ea7SShri Abhyankar 23402b0b2ea7SShri Abhyankar /* forward solve the lower triangular */ 234129a97285SShri Abhyankar idx = 0; 23420fa040f9SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 23430fa040f9SShri Abhyankar x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] 
= b[8+idx]; x[9] = b[9+idx]; 23440fa040f9SShri Abhyankar x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 23452b0b2ea7SShri Abhyankar 23462b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 23472b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 23482b0b2ea7SShri Abhyankar vi = aj + ai[i]; 23492b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 23500fa040f9SShri Abhyankar idt = bs*i; 23510fa040f9SShri Abhyankar s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 23520fa040f9SShri Abhyankar s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 23530fa040f9SShri Abhyankar s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 23542b0b2ea7SShri Abhyankar for (m=0; m<nz; m++) { 23552b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23560fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 23570fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 23580fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 23592b0b2ea7SShri Abhyankar 23600b8f6341SShri Abhyankar 23612b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 23622b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 23632b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 23642b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + 
v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 23652b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 23662b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 23672b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 23682b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 23692b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 23702b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 23712b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 23722b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 23732b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 23742b0b2ea7SShri Abhyankar s14 -= 
v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 23752b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 23762b0b2ea7SShri Abhyankar 23772b0b2ea7SShri Abhyankar v += bs2; 23782b0b2ea7SShri Abhyankar } 23790fa040f9SShri Abhyankar x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 23800fa040f9SShri Abhyankar x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 23810fa040f9SShri Abhyankar x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 23822b0b2ea7SShri Abhyankar 23832b0b2ea7SShri Abhyankar } 23842b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 23852b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--) { 23862b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 23872b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 23882b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 23892b0b2ea7SShri Abhyankar idt = bs*i; 23900fa040f9SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 23910fa040f9SShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 23920fa040f9SShri Abhyankar s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 23932b0b2ea7SShri Abhyankar 23942b0b2ea7SShri Abhyankar for (m=0; m<nz; m++) { 23952b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23960fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 23970fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 23980fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 23992b0b2ea7SShri Abhyankar 
24002b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24012b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24022b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24032b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24042b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24052b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24062b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24072b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24082b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24092b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + 
v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24102b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24112b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 24122b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24132b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24142b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24152b0b2ea7SShri Abhyankar 24162b0b2ea7SShri Abhyankar v += bs2; 24172b0b2ea7SShri Abhyankar } 24182b0b2ea7SShri Abhyankar 24190fa040f9SShri Abhyankar x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 24200fa040f9SShri Abhyankar x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 24210fa040f9SShri Abhyankar x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 24220fa040f9SShri Abhyankar x[3+idt] = v[3]*s1 
+ v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 24230fa040f9SShri Abhyankar x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 24240fa040f9SShri Abhyankar x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 24250fa040f9SShri Abhyankar x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 24260fa040f9SShri Abhyankar x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 24270fa040f9SShri Abhyankar x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 24280fa040f9SShri Abhyankar x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 24290fa040f9SShri Abhyankar x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 24300fa040f9SShri Abhyankar x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 24310fa040f9SShri Abhyankar x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + 
v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 24320fa040f9SShri Abhyankar x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 24330fa040f9SShri Abhyankar x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 24342b0b2ea7SShri Abhyankar 24352b0b2ea7SShri Abhyankar } 24362b0b2ea7SShri Abhyankar 24373649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 24382b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24392b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 24402b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 24412b0b2ea7SShri Abhyankar } 24422b0b2ea7SShri Abhyankar 2443832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing one column at at time */ 2444832cc040SShri Abhyankar /* Default MatSolve for block size 15 */ 2445832cc040SShri Abhyankar 24468499736aSShri Abhyankar #undef __FUNCT__ 2447832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2448832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 24490b8f6341SShri Abhyankar { 24500b8f6341SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 24510b8f6341SShri Abhyankar PetscErrorCode ierr; 24520b8f6341SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 245353ef36baSBarry Smith PetscInt i,k,nz,idx,idt,m; 24540b8f6341SShri Abhyankar const MatScalar *aa=a->a,*v; 24550b8f6341SShri Abhyankar PetscScalar s[15]; 245653ef36baSBarry Smith PetscScalar *x,xv; 24570b8f6341SShri Abhyankar const PetscScalar *b; 24580b8f6341SShri Abhyankar 24590b8f6341SShri Abhyankar PetscFunctionBegin; 24603649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 24610b8f6341SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24620b8f6341SShri Abhyankar 24630b8f6341SShri Abhyankar /* forward solve the lower triangular */ 2464832cc040SShri Abhyankar for (i=0; i<n; i++) { 24650b8f6341SShri Abhyankar v = aa + bs2*ai[i]; 24660b8f6341SShri Abhyankar vi = aj + ai[i]; 24670b8f6341SShri Abhyankar nz = ai[i+1] - ai[i]; 24680fa040f9SShri Abhyankar idt = bs*i; 2469832cc040SShri Abhyankar x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2470832cc040SShri Abhyankar x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2471832cc040SShri Abhyankar x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 24720b8f6341SShri Abhyankar for (m=0; m<nz; m++) { 24730b8f6341SShri Abhyankar idx = bs*vi[m]; 24740b8f6341SShri Abhyankar for (k=0; k<15; 
k++) { 247553ef36baSBarry Smith xv = x[k + idx]; 247653ef36baSBarry Smith x[idt] -= v[0]*xv; 247753ef36baSBarry Smith x[1+idt] -= v[1]*xv; 247853ef36baSBarry Smith x[2+idt] -= v[2]*xv; 247953ef36baSBarry Smith x[3+idt] -= v[3]*xv; 248053ef36baSBarry Smith x[4+idt] -= v[4]*xv; 248153ef36baSBarry Smith x[5+idt] -= v[5]*xv; 248253ef36baSBarry Smith x[6+idt] -= v[6]*xv; 248353ef36baSBarry Smith x[7+idt] -= v[7]*xv; 248453ef36baSBarry Smith x[8+idt] -= v[8]*xv; 248553ef36baSBarry Smith x[9+idt] -= v[9]*xv; 248653ef36baSBarry Smith x[10+idt] -= v[10]*xv; 248753ef36baSBarry Smith x[11+idt] -= v[11]*xv; 248853ef36baSBarry Smith x[12+idt] -= v[12]*xv; 248953ef36baSBarry Smith x[13+idt] -= v[13]*xv; 249053ef36baSBarry Smith x[14+idt] -= v[14]*xv; 24910b8f6341SShri Abhyankar v += 15; 24920b8f6341SShri Abhyankar } 24930b8f6341SShri Abhyankar } 24940b8f6341SShri Abhyankar } 24950b8f6341SShri Abhyankar /* backward solve the upper triangular */ 24960b8f6341SShri Abhyankar for (i=n-1; i>=0; i--) { 24970b8f6341SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 24980b8f6341SShri Abhyankar vi = aj + adiag[i+1]+1; 24990b8f6341SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 25000b8f6341SShri Abhyankar idt = bs*i; 25010fa040f9SShri Abhyankar s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 25020fa040f9SShri Abhyankar s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 25030fa040f9SShri Abhyankar s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 25040b8f6341SShri Abhyankar 25050b8f6341SShri Abhyankar for (m=0; m<nz; m++) { 25060b8f6341SShri Abhyankar idx = bs*vi[m]; 25070b8f6341SShri Abhyankar for (k=0; k<15; k++) { 250853ef36baSBarry Smith xv = x[k + idx]; 250953ef36baSBarry Smith s[0] -= v[0]*xv; 251053ef36baSBarry Smith s[1] -= v[1]*xv; 251153ef36baSBarry Smith s[2] -= v[2]*xv; 251253ef36baSBarry Smith s[3] -= v[3]*xv; 251353ef36baSBarry Smith s[4] -= v[4]*xv; 251453ef36baSBarry 
Smith s[5] -= v[5]*xv; 251553ef36baSBarry Smith s[6] -= v[6]*xv; 251653ef36baSBarry Smith s[7] -= v[7]*xv; 251753ef36baSBarry Smith s[8] -= v[8]*xv; 251853ef36baSBarry Smith s[9] -= v[9]*xv; 251953ef36baSBarry Smith s[10] -= v[10]*xv; 252053ef36baSBarry Smith s[11] -= v[11]*xv; 252153ef36baSBarry Smith s[12] -= v[12]*xv; 252253ef36baSBarry Smith s[13] -= v[13]*xv; 252353ef36baSBarry Smith s[14] -= v[14]*xv; 25240b8f6341SShri Abhyankar v += 15; 25250b8f6341SShri Abhyankar } 25260b8f6341SShri Abhyankar } 25270fa040f9SShri Abhyankar ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 25280b8f6341SShri Abhyankar for (k=0; k<15; k++) { 25290fa040f9SShri Abhyankar x[idt] += v[0]*s[k]; 25300fa040f9SShri Abhyankar x[1+idt] += v[1]*s[k]; 25310fa040f9SShri Abhyankar x[2+idt] += v[2]*s[k]; 25320fa040f9SShri Abhyankar x[3+idt] += v[3]*s[k]; 25330fa040f9SShri Abhyankar x[4+idt] += v[4]*s[k]; 25340fa040f9SShri Abhyankar x[5+idt] += v[5]*s[k]; 25350fa040f9SShri Abhyankar x[6+idt] += v[6]*s[k]; 25360fa040f9SShri Abhyankar x[7+idt] += v[7]*s[k]; 25370fa040f9SShri Abhyankar x[8+idt] += v[8]*s[k]; 25380fa040f9SShri Abhyankar x[9+idt] += v[9]*s[k]; 25390fa040f9SShri Abhyankar x[10+idt] += v[10]*s[k]; 25400fa040f9SShri Abhyankar x[11+idt] += v[11]*s[k]; 25410fa040f9SShri Abhyankar x[12+idt] += v[12]*s[k]; 25420fa040f9SShri Abhyankar x[13+idt] += v[13]*s[k]; 25430fa040f9SShri Abhyankar x[14+idt] += v[14]*s[k]; 25440b8f6341SShri Abhyankar v += 15; 25450b8f6341SShri Abhyankar } 25460b8f6341SShri Abhyankar } 25473649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 25480b8f6341SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 25490b8f6341SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 25500b8f6341SShri Abhyankar PetscFunctionReturn(0); 25510b8f6341SShri Abhyankar } 25520b8f6341SShri Abhyankar 25530b8f6341SShri Abhyankar 25540b8f6341SShri Abhyankar #undef __FUNCT__ 255506e38f1dSHong Zhang #define __FUNCT__ 
"MatSolve_SeqBAIJ_7_inplace" 255606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 25574e2b4712SSatish Balay { 25584e2b4712SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 25594e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 25606849ba73SBarry Smith PetscErrorCode ierr; 2561b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2562b3260449SShri Abhyankar const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2563b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2564b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2565b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2566b3260449SShri Abhyankar const PetscScalar *b; 25674e2b4712SSatish Balay 25684e2b4712SSatish Balay PetscFunctionBegin; 25693649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2571f1af5d2fSBarry Smith t = a->solve_work; 25724e2b4712SSatish Balay 25734e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 25744e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 25754e2b4712SSatish Balay 25764e2b4712SSatish Balay /* forward solve the lower triangular */ 25774e2b4712SSatish Balay idx = 7*(*r++); 2578f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2579f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2580f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 25814e2b4712SSatish Balay 25824e2b4712SSatish Balay for (i=1; i<n; i++) { 25834e2b4712SSatish Balay v = aa + 49*ai[i]; 25844e2b4712SSatish Balay vi = aj + ai[i]; 25854e2b4712SSatish Balay nz = diag[i] - ai[i]; 25864e2b4712SSatish Balay idx = 7*(*r++); 2587f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2588f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 25894e2b4712SSatish Balay while (nz--) { 25904e2b4712SSatish Balay idx = 7*(*vi++); 2591f1af5d2fSBarry 
Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2592f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2593f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2594f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2595f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2596f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2597f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2598f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2599f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2600f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26014e2b4712SSatish Balay v += 49; 26024e2b4712SSatish Balay } 26034e2b4712SSatish Balay idx = 7*i; 2604f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2605f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2606f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 26074e2b4712SSatish Balay } 26084e2b4712SSatish Balay /* backward solve the upper triangular */ 26094e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 26104e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 26114e2b4712SSatish Balay vi = aj + diag[i] + 1; 26124e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26134e2b4712SSatish Balay idt = 7*i; 2614f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2615f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2616f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 26174e2b4712SSatish Balay while (nz--) { 26184e2b4712SSatish Balay idx = 7*(*vi++); 2619f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2620f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2621f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2622f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + 
v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2623f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2624f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2625f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2626f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2627f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2628f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26294e2b4712SSatish Balay v += 49; 26304e2b4712SSatish Balay } 26314e2b4712SSatish Balay idc = 7*(*c--); 26324e2b4712SSatish Balay v = aa + 49*diag[i]; 2633f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2634f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2635f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2636f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2637f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2638f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2639f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2640f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2641f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2642f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2643f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2644f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2645f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2646f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 26474e2b4712SSatish Balay } 26484e2b4712SSatish Balay 26494e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 26504e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 
26513649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 26521ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2653dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 26544e2b4712SSatish Balay PetscFunctionReturn(0); 26554e2b4712SSatish Balay } 26564e2b4712SSatish Balay 26578f690400SShri Abhyankar #undef __FUNCT__ 26584dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 26594dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 266035aa4fcfSShri Abhyankar { 266135aa4fcfSShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 266235aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 266335aa4fcfSShri Abhyankar PetscErrorCode ierr; 2664b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2665b3260449SShri Abhyankar const PetscInt n=a->mbs,*rout,*cout,*vi; 2666b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 2667b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2668b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2669b3260449SShri Abhyankar const PetscScalar *b; 267035aa4fcfSShri Abhyankar 267135aa4fcfSShri Abhyankar PetscFunctionBegin; 26723649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 267335aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 267435aa4fcfSShri Abhyankar t = a->solve_work; 267535aa4fcfSShri Abhyankar 267635aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 267735aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 267835aa4fcfSShri Abhyankar 267935aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 268035aa4fcfSShri Abhyankar idx = 7*r[0]; 268135aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 268235aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 268335aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 268435aa4fcfSShri Abhyankar 268535aa4fcfSShri 
Abhyankar for (i=1; i<n; i++) { 268635aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 268735aa4fcfSShri Abhyankar vi = aj + ai[i]; 268835aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 268935aa4fcfSShri Abhyankar idx = 7*r[i]; 269035aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 269135aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 269235aa4fcfSShri Abhyankar for (m=0; m<nz; m++) { 269335aa4fcfSShri Abhyankar idx = 7*vi[m]; 269435aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 269535aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 269635aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 269735aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 269835aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 269935aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 270035aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 270135aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 270235aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 270335aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 270435aa4fcfSShri Abhyankar v += 49; 270535aa4fcfSShri Abhyankar } 270635aa4fcfSShri Abhyankar idx = 7*i; 270735aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 270835aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 270935aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 271035aa4fcfSShri Abhyankar } 271135aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 271235aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--) { 271335aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 271435aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 271535aa4fcfSShri Abhyankar nz = 
adiag[i] - adiag[i+1] - 1; 271635aa4fcfSShri Abhyankar idt = 7*i; 271735aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 271835aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 271935aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 272035aa4fcfSShri Abhyankar for (m=0; m<nz; m++) { 272135aa4fcfSShri Abhyankar idx = 7*vi[m]; 272235aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 272335aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 272435aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 272535aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 272635aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 272735aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 272835aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 272935aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 273035aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 273135aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 273235aa4fcfSShri Abhyankar v += 49; 273335aa4fcfSShri Abhyankar } 273435aa4fcfSShri Abhyankar idc = 7*c[i]; 273535aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 273635aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 273735aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 273835aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 273935aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 274035aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 274135aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 274235aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 274335aa4fcfSShri Abhyankar x[4+idc] 
= t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 274435aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 274535aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 274635aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 274735aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 274835aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 274935aa4fcfSShri Abhyankar } 275035aa4fcfSShri Abhyankar 275135aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 275235aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 27533649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 275435aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 275535aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 275635aa4fcfSShri Abhyankar PetscFunctionReturn(0); 275735aa4fcfSShri Abhyankar } 275835aa4fcfSShri Abhyankar
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_inplace - Forward and backward block
   substitution with a factored SeqBAIJ matrix whose blocks are 7x7.  No row
   or column index set is applied: block row i of bb is read and block row i
   of xx is written.

   Input Parameters:
+  A  - the factored matrix; its Mat_SeqBAIJ data supplies i, j, diag and a
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   A block is 49 consecutive values addressed as v[r + 7*c].  In block row i
   the blocks stored before diag[i] are consumed by the forward sweep and the
   blocks stored after diag[i] by the backward sweep.  The block at diag[i] is
   applied by multiplication, so it is presumably stored already inverted (the
   sibling MatSolve_SeqBAIJ_7_NaturalOrdering labels the same step
   "x = inv_diagonal*x").
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  PetscInt          i,nz,idx,idt,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* forward solve the lower triangular */
  /* block row 0 is copied unchanged; the loop below starts at block row 1 */
  idx  = 0;
  x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
  x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
  x[6] = b[6+idx];
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];   /* blocks stored before the diagonal block of this row */
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }
  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;   /* blocks stored after the diagonal block of this row */
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* apply the block stored at diag[i] to the accumulated row values */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
                       + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
                       + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
                       + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                       + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                       + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                       + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                       + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* 7x7 blocks: 49 values per block and 7 scalars per block row, the same
     count MatSolve_SeqBAIJ_7 logs */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
285515091d37SBarry Smith 2856cee9d6f2SShri Abhyankar #undef __FUNCT__ 28574dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 28584dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 285953cca76cSShri Abhyankar { 286053cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 2861b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 286253cca76cSShri Abhyankar PetscErrorCode ierr; 2863b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 2864b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 286553cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 286653cca76cSShri Abhyankar PetscScalar *x; 286753cca76cSShri Abhyankar const PetscScalar *b; 286853cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 286953cca76cSShri Abhyankar 287053cca76cSShri Abhyankar PetscFunctionBegin; 28713649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 287253cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 287353cca76cSShri Abhyankar /* forward solve the lower triangular
*/ 287453cca76cSShri Abhyankar idx = 0; 287553cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 287653cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 287753cca76cSShri Abhyankar for (i=1; i<n; i++) { 287853cca76cSShri Abhyankar v = aa + bs2*ai[i]; 287953cca76cSShri Abhyankar vi = aj + ai[i]; 288053cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 288153cca76cSShri Abhyankar idx = bs*i; 288253cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 288353cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 288453cca76cSShri Abhyankar for (k=0; k<nz; k++) { 288553cca76cSShri Abhyankar jdx = bs*vi[k]; 288653cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 288753cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 288853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 288953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 289053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 289153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 289253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 289353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 289453cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 289553cca76cSShri Abhyankar v += bs2; 289653cca76cSShri Abhyankar } 289753cca76cSShri Abhyankar 289853cca76cSShri Abhyankar x[idx] = s1; 289953cca76cSShri Abhyankar x[1+idx] = s2; 290053cca76cSShri Abhyankar x[2+idx] = s3; 290153cca76cSShri Abhyankar x[3+idx] = s4; 290253cca76cSShri Abhyankar x[4+idx] = s5; 290353cca76cSShri Abhyankar x[5+idx] = s6; 290453cca76cSShri Abhyankar x[6+idx] = s7; 
290553cca76cSShri Abhyankar } 290653cca76cSShri Abhyankar 290753cca76cSShri Abhyankar /* backward solve the upper triangular */ 290853cca76cSShri Abhyankar for (i=n-1; i>=0; i--) { 290953cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 291053cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 291153cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 291253cca76cSShri Abhyankar idt = bs*i; 291353cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 291453cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 291553cca76cSShri Abhyankar for (k=0; k<nz; k++) { 291653cca76cSShri Abhyankar idx = bs*vi[k]; 291753cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 291853cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 291953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 292053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 292153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 292253cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 292353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 292453cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 292553cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 292653cca76cSShri Abhyankar v += bs2; 292753cca76cSShri Abhyankar } 292853cca76cSShri Abhyankar /* x = inv_diagonal*x */ 292953cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 293053cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 293153cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 
+ v[44]*s7; 293253cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 293353cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 293453cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 293553cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 293653cca76cSShri Abhyankar } 293753cca76cSShri Abhyankar 29383649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 293953cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 294053cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 294153cca76cSShri Abhyankar PetscFunctionReturn(0); 294253cca76cSShri Abhyankar } 294353cca76cSShri Abhyankar
/*
   MatSolve_SeqBAIJ_6_inplace - Block size 6 triangular solves with a factored
   SeqBAIJ matrix, applying the index sets a->row and a->col.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   Block row i of the right-hand side is read at the position given by entry i
   of a->row, the sweeps run in a->solve_work, and block row i of the result is
   written to xx at the position given by entry i of a->col.  A block is 36
   consecutive values addressed as v[r + 6*c]; the block at diag[i] is applied
   by multiplication at the end of the backward sweep.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                isrow = a->row,iscol = a->col;
  const PetscInt    nblk = a->mbs;
  const PetscInt    *rowst = a->i,*colidx = a->j,*dloc = a->diag;
  const PetscInt    *rperm,*cperm,*cols;
  const MatScalar   *vals = a->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*out;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  PetscInt          row,k,cnt;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = a->solve_work;

  ierr = ISGetIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 only needs the gather from the right-hand side */
  src     = rhs + 6*rperm[0];
  work[0] = src[0]; work[1] = src[1];
  work[2] = src[2]; work[3] = src[3];
  work[4] = src[4]; work[5] = src[5];
  for (row=1; row<nblk; row++) {
    src  = rhs + 6*rperm[row];
    s1   = src[0]; s2 = src[1]; s3 = src[2];
    s4   = src[3]; s5 = src[4]; s6 = src[5];
    blk  = vals + 36*rowst[row];
    cols = colidx + rowst[row];
    cnt  = dloc[row] - rowst[row];
    for (k=0; k<cnt; k++) {
      src = work + 6*cols[k];
      x1  = src[0]; x2 = src[1]; x3 = src[2];
      x4  = src[3]; x5 = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    cur    = work + 6*row;
    cur[0] = s1; cur[1] = s2;
    cur[2] = s3; cur[3] = s4;
    cur[4] = s5; cur[5] = s6;
  }

  /* backward sweep, last block row first */
  for (row=nblk-1; row>=0; row--) {
    cur  = work + 6*row;
    s1   = cur[0]; s2 = cur[1];
    s3   = cur[2]; s4 = cur[3];
    s5   = cur[4]; s6 = cur[5];
    blk  = vals + 36*dloc[row] + 36;
    cols = colidx + dloc[row] + 1;
    cnt  = rowst[row+1] - dloc[row] - 1;
    for (k=0; k<cnt; k++) {
      src = work + 6*cols[k];
      x1  = src[0]; x2 = src[1];
      x3  = src[2]; x4 = src[3];
      x5  = src[4]; x6 = src[5];
      s1 -= blk[0]*x1 + blk[6]*x2  + blk[12]*x3 + blk[18]*x4 + blk[24]*x5 + blk[30]*x6;
      s2 -= blk[1]*x1 + blk[7]*x2  + blk[13]*x3 + blk[19]*x4 + blk[25]*x5 + blk[31]*x6;
      s3 -= blk[2]*x1 + blk[8]*x2  + blk[14]*x3 + blk[20]*x4 + blk[26]*x5 + blk[32]*x6;
      s4 -= blk[3]*x1 + blk[9]*x2  + blk[15]*x3 + blk[21]*x4 + blk[27]*x5 + blk[33]*x6;
      s5 -= blk[4]*x1 + blk[10]*x2 + blk[16]*x3 + blk[22]*x4 + blk[28]*x5 + blk[34]*x6;
      s6 -= blk[5]*x1 + blk[11]*x2 + blk[17]*x3 + blk[23]*x4 + blk[29]*x5 + blk[35]*x6;
      blk += 36;
    }
    /* apply the block at diag[row]; keep the result in the work array and
       scatter it to the solution through the column index set */
    blk    = vals + 36*dloc[row];
    out    = sol + 6*cperm[row];
    out[0] = cur[0] = blk[0]*s1+blk[6]*s2+blk[12]*s3+
                      blk[18]*s4+blk[24]*s5+blk[30]*s6;
    out[1] = cur[1] = blk[1]*s1+blk[7]*s2+blk[13]*s3+
                      blk[19]*s4+blk[25]*s5+blk[31]*s6;
    out[2] = cur[2] = blk[2]*s1+blk[8]*s2+blk[14]*s3+
                      blk[20]*s4+blk[26]*s5+blk[32]*s6;
    out[3] = cur[3] = blk[3]*s1+blk[9]*s2+blk[15]*s3+
                      blk[21]*s4+blk[27]*s5+blk[33]*s6;
    out[4] = cur[4] = blk[4]*s1+blk[10]*s2+blk[16]*s3+
                      blk[22]*s4+blk[28]*s5+blk[34]*s6;
    out[5] = cur[5] = blk[5]*s1+blk[11]*s2+blk[17]*s3+
                      blk[23]*s4+blk[29]*s5+blk[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
304015091d37SBarry Smith 30416506fda5SShri Abhyankar #undef __FUNCT__ 30424dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6" 30434dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 30446506fda5SShri Abhyankar { 30456506fda5SShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 30466506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 30476506fda5SShri Abhyankar PetscErrorCode ierr; 30486506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3049b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3050b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 30516506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 30526506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 30536506fda5SShri Abhyankar const PetscScalar *b; 3054b3260449SShri Abhyankar 30556506fda5SShri Abhyankar PetscFunctionBegin; 30563649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 30576506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 30586506fda5SShri Abhyankar t = a->solve_work; 30596506fda5SShri Abhyankar 30606506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 30616506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 30626506fda5SShri Abhyankar 30636506fda5SShri Abhyankar /* forward solve the lower triangular */ 30646506fda5SShri
Abhyankar idx = 6*r[0]; 30656506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 30666506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 30676506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 30686506fda5SShri Abhyankar for (i=1; i<n; i++) { 30696506fda5SShri Abhyankar v = aa + 36*ai[i]; 30706506fda5SShri Abhyankar vi = aj + ai[i]; 30716506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 30726506fda5SShri Abhyankar idx = 6*r[i]; 30736506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 30746506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 30756506fda5SShri Abhyankar for (m=0; m<nz; m++) { 30766506fda5SShri Abhyankar idx = 6*vi[m]; 30776506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 30786506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 30796506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 30806506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 30816506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 30826506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 30836506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 30846506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 30856506fda5SShri Abhyankar v += 36; 30866506fda5SShri Abhyankar } 30876506fda5SShri Abhyankar idx = 6*i; 30886506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 30896506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 30906506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 30916506fda5SShri Abhyankar } 30926506fda5SShri Abhyankar /* backward solve the upper triangular */ 30936506fda5SShri Abhyankar for (i=n-1; i>=0; i--) { 30946506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 30956506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 30966506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 
1; 30976506fda5SShri Abhyankar idt = 6*i; 30986506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 30996506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 31006506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 31016506fda5SShri Abhyankar for (m=0; m<nz; m++) { 31026506fda5SShri Abhyankar idx = 6*vi[m]; 31036506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 31046506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 31056506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 31066506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 31076506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 31086506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 31096506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 31106506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 31116506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 31126506fda5SShri Abhyankar v += 36; 31136506fda5SShri Abhyankar } 31146506fda5SShri Abhyankar idc = 6*c[i]; 31156506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 31166506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 31176506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 31186506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 31196506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 31206506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 31216506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 31226506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 31236506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 31246506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 31256506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 31266506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 31276506fda5SShri Abhyankar } 
31286506fda5SShri Abhyankar 31296506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 31306506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 31313649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 31326506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 31336506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 31346506fda5SShri Abhyankar PetscFunctionReturn(0); 31356506fda5SShri Abhyankar } 31368f690400SShri Abhyankar 31378f690400SShri Abhyankar #undef __FUNCT__ 313806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 313906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 314015091d37SBarry Smith { 314115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3142b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3143dfbe8321SBarry Smith PetscErrorCode ierr; 3144b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3145d9fead3dSBarry Smith const MatScalar *aa =a->a,*v; 3146d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3147d9fead3dSBarry Smith const PetscScalar *b; 314815091d37SBarry Smith 314915091d37SBarry Smith PetscFunctionBegin; 31503649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 31511ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 315215091d37SBarry Smith /* forward solve the lower triangular */ 315315091d37SBarry Smith idx = 0; 315415091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 315515091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 315615091d37SBarry Smith for (i=1; i<n; i++) { 315715091d37SBarry Smith v = aa + 36*ai[i]; 315815091d37SBarry Smith vi = aj + ai[i]; 315915091d37SBarry Smith nz = diag[i] - ai[i]; 316015091d37SBarry Smith idx = 6*i; 3161f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3162f1af5d2fSBarry 
Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 316315091d37SBarry Smith while (nz--) { 316415091d37SBarry Smith jdx = 6*(*vi++); 316515091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 316615091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3167f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3168f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3169f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3170f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3171f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3172f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 317315091d37SBarry Smith v += 36; 317415091d37SBarry Smith } 3175f1af5d2fSBarry Smith x[idx] = s1; 3176f1af5d2fSBarry Smith x[1+idx] = s2; 3177f1af5d2fSBarry Smith x[2+idx] = s3; 3178f1af5d2fSBarry Smith x[3+idx] = s4; 3179f1af5d2fSBarry Smith x[4+idx] = s5; 3180f1af5d2fSBarry Smith x[5+idx] = s6; 318115091d37SBarry Smith } 318215091d37SBarry Smith /* backward solve the upper triangular */ 318315091d37SBarry Smith for (i=n-1; i>=0; i--) { 318415091d37SBarry Smith v = aa + 36*diag[i] + 36; 318515091d37SBarry Smith vi = aj + diag[i] + 1; 318615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 318715091d37SBarry Smith idt = 6*i; 3188f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3189f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 3190f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 319115091d37SBarry Smith while (nz--) { 319215091d37SBarry Smith idx = 6*(*vi++); 319315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 319415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3195f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3196f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + 
v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3197f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3198f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3199f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3200f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 320115091d37SBarry Smith v += 36; 320215091d37SBarry Smith } 320315091d37SBarry Smith v = aa + 36*diag[i]; 3204f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3205f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3206f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3207f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3208f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3209f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 321015091d37SBarry Smith } 321115091d37SBarry Smith 32123649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 32131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3214dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 321515091d37SBarry Smith PetscFunctionReturn(0); 321615091d37SBarry Smith } 321715091d37SBarry Smith 3218cee9d6f2SShri Abhyankar #undef __FUNCT__ 32194dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 32204dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 322153cca76cSShri Abhyankar { 322253cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3223b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 322453cca76cSShri Abhyankar PetscErrorCode ierr; 3225b3260449SShri 
Abhyankar PetscInt i,k,nz,idx,jdx,idt; 3226b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 322753cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 322853cca76cSShri Abhyankar PetscScalar *x; 322953cca76cSShri Abhyankar const PetscScalar *b; 323053cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 323153cca76cSShri Abhyankar 323253cca76cSShri Abhyankar PetscFunctionBegin; 32333649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 323453cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 323553cca76cSShri Abhyankar /* forward solve the lower triangular */ 323653cca76cSShri Abhyankar idx = 0; 323753cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 323853cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 323953cca76cSShri Abhyankar for (i=1; i<n; i++) { 324053cca76cSShri Abhyankar v = aa + bs2*ai[i]; 324153cca76cSShri Abhyankar vi = aj + ai[i]; 324253cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 324353cca76cSShri Abhyankar idx = bs*i; 324453cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 324553cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 324653cca76cSShri Abhyankar for (k=0; k<nz; k++) { 324753cca76cSShri Abhyankar jdx = bs*vi[k]; 324853cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 324953cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 325053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 325153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 325253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 325353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 325453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 325553cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + 
v[35]*x6; 325653cca76cSShri Abhyankar v += bs2; 325753cca76cSShri Abhyankar } 325853cca76cSShri Abhyankar 325953cca76cSShri Abhyankar x[idx] = s1; 326053cca76cSShri Abhyankar x[1+idx] = s2; 326153cca76cSShri Abhyankar x[2+idx] = s3; 326253cca76cSShri Abhyankar x[3+idx] = s4; 326353cca76cSShri Abhyankar x[4+idx] = s5; 326453cca76cSShri Abhyankar x[5+idx] = s6; 326553cca76cSShri Abhyankar } 326653cca76cSShri Abhyankar 326753cca76cSShri Abhyankar /* backward solve the upper triangular */ 326853cca76cSShri Abhyankar for (i=n-1; i>=0; i--) { 326953cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 327053cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 327153cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 327253cca76cSShri Abhyankar idt = bs*i; 327353cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 327453cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 327553cca76cSShri Abhyankar for (k=0; k<nz; k++) { 327653cca76cSShri Abhyankar idx = bs*vi[k]; 327753cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 327853cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 327953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 328053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 328153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 328253cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 328353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 328453cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 328553cca76cSShri Abhyankar v += bs2; 328653cca76cSShri Abhyankar } 328753cca76cSShri Abhyankar /* x = inv_diagonal*x */ 328853cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 328953cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + 
v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 329053cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 329153cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 329253cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 329353cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 329453cca76cSShri Abhyankar } 329553cca76cSShri Abhyankar 32963649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 329753cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 329853cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 329953cca76cSShri Abhyankar PetscFunctionReturn(0); 330053cca76cSShri Abhyankar } 330153cca76cSShri Abhyankar
/*
   MatSolve_SeqBAIJ_5_inplace - Block size 5 triangular solves with a factored
   SeqBAIJ matrix, applying the index sets a->row and a->col.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   Block row i of the right-hand side is read at the position given by entry i
   of a->row, the sweeps run in a->solve_work, and block row i of the result is
   written to xx at the position given by entry i of a->col.  A block is 25
   consecutive values addressed as v[r + 5*c]; the block at diag[i] is applied
   by multiplication at the end of the backward sweep.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
  IS                isrow = a->row,iscol = a->col;
  const PetscInt    nblk = a->mbs;
  const PetscInt    *rowst = a->i,*colidx = a->j,*dloc = a->diag;
  const PetscInt    *rperm,*cperm,*cols;
  const MatScalar   *vals = a->a,*blk;
  const PetscScalar *rhs,*src;
  PetscScalar       *sol,*work,*cur,*out;
  PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
  PetscInt          row,k,cnt;
  PetscErrorCode    ierr;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = a->solve_work;

  ierr = ISGetIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 only needs the gather from the right-hand side */
  src     = rhs + 5*rperm[0];
  work[0] = src[0]; work[1] = src[1];
  work[2] = src[2]; work[3] = src[3]; work[4] = src[4];
  for (row=1; row<nblk; row++) {
    src  = rhs + 5*rperm[row];
    s1   = src[0]; s2 = src[1]; s3 = src[2]; s4 = src[3];
    s5   = src[4];
    blk  = vals + 25*rowst[row];
    cols = colidx + rowst[row];
    cnt  = dloc[row] - rowst[row];
    for (k=0; k<cnt; k++) {
      src = work + 5*cols[k];
      x1  = src[0]; x2 = src[1]; x3 = src[2];
      x4  = src[3]; x5 = src[4];
      s1 -= blk[0]*x1 + blk[5]*x2 + blk[10]*x3 + blk[15]*x4 + blk[20]*x5;
      s2 -= blk[1]*x1 + blk[6]*x2 + blk[11]*x3 + blk[16]*x4 + blk[21]*x5;
      s3 -= blk[2]*x1 + blk[7]*x2 + blk[12]*x3 + blk[17]*x4 + blk[22]*x5;
      s4 -= blk[3]*x1 + blk[8]*x2 + blk[13]*x3 + blk[18]*x4 + blk[23]*x5;
      s5 -= blk[4]*x1 + blk[9]*x2 + blk[14]*x3 + blk[19]*x4 + blk[24]*x5;
      blk += 25;
    }
    cur    = work + 5*row;
    cur[0] = s1; cur[1] = s2;
    cur[2] = s3; cur[3] = s4; cur[4] = s5;
  }

  /* backward sweep, last block row first */
  for (row=nblk-1; row>=0; row--) {
    cur  = work + 5*row;
    s1   = cur[0]; s2 = cur[1];
    s3   = cur[2]; s4 = cur[3]; s5 = cur[4];
    blk  = vals + 25*dloc[row] + 25;
    cols = colidx + dloc[row] + 1;
    cnt  = rowst[row+1] - dloc[row] - 1;
    for (k=0; k<cnt; k++) {
      src = work + 5*cols[k];
      x1  = src[0]; x2 = src[1];
      x3  = src[2]; x4 = src[3]; x5 = src[4];
      s1 -= blk[0]*x1 + blk[5]*x2 + blk[10]*x3 + blk[15]*x4 + blk[20]*x5;
      s2 -= blk[1]*x1 + blk[6]*x2 + blk[11]*x3 + blk[16]*x4 + blk[21]*x5;
      s3 -= blk[2]*x1 + blk[7]*x2 + blk[12]*x3 + blk[17]*x4 + blk[22]*x5;
      s4 -= blk[3]*x1 + blk[8]*x2 + blk[13]*x3 + blk[18]*x4 + blk[23]*x5;
      s5 -= blk[4]*x1 + blk[9]*x2 + blk[14]*x3 + blk[19]*x4 + blk[24]*x5;
      blk += 25;
    }
    /* apply the block at diag[row]; keep the result in the work array and
       scatter it to the solution through the column index set */
    blk    = vals + 25*dloc[row];
    out    = sol + 5*cperm[row];
    out[0] = cur[0] = blk[0]*s1+blk[5]*s2+blk[10]*s3+
                      blk[15]*s4+blk[20]*s5;
    out[1] = cur[1] = blk[1]*s1+blk[6]*s2+blk[11]*s3+
                      blk[16]*s4+blk[21]*s5;
    out[2] = cur[2] = blk[2]*s1+blk[7]*s2+blk[12]*s3+
                      blk[17]*s4+blk[22]*s5;
    out[3] = cur[3] = blk[3]*s1+blk[8]*s2+blk[13]*s3+
                      blk[18]*s4+blk[23]*s5;
    out[4] = cur[4] = blk[4]*s1+blk[9]*s2+blk[14]*s3+
                      blk[19]*s4+blk[24]*s5;
  }

  ierr = ISRestoreIndices(isrow,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
33904e2b4712SSatish Balay 339178bb4007SShri Abhyankar #undef __FUNCT__ 33924dd39f65SShri Abhyankar
#define __FUNCT__ "MatSolve_SeqBAIJ_5" 33934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 339478bb4007SShri Abhyankar { 339578bb4007SShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 339678bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 339778bb4007SShri Abhyankar PetscErrorCode ierr; 339878bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3399b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3400b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 340178bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 340278bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 340378bb4007SShri Abhyankar const PetscScalar *b; 340478bb4007SShri Abhyankar 340578bb4007SShri Abhyankar PetscFunctionBegin; 34063649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 340778bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 340878bb4007SShri Abhyankar t = a->solve_work; 340978bb4007SShri Abhyankar 341078bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 341178bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 341278bb4007SShri Abhyankar 341378bb4007SShri Abhyankar /* forward solve the lower triangular */ 341478bb4007SShri Abhyankar idx = 5*r[0]; 341578bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 341678bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 341778bb4007SShri Abhyankar for (i=1; i<n; i++) { 341878bb4007SShri Abhyankar v = aa + 25*ai[i]; 341978bb4007SShri Abhyankar vi = aj + ai[i]; 342078bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 342178bb4007SShri Abhyankar idx = 5*r[i]; 342278bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 342378bb4007SShri Abhyankar s5 = b[4+idx]; 342478bb4007SShri Abhyankar for (m=0; m<nz; m++) { 342578bb4007SShri Abhyankar idx = 5*vi[m]; 342678bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 
342778bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 342878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 342978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 343078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 343178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 343278bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 343378bb4007SShri Abhyankar v += 25; 343478bb4007SShri Abhyankar } 343578bb4007SShri Abhyankar idx = 5*i; 343678bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 343778bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 343878bb4007SShri Abhyankar } 343978bb4007SShri Abhyankar /* backward solve the upper triangular */ 344078bb4007SShri Abhyankar for (i=n-1; i>=0; i--) { 344178bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 344278bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 344378bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 344478bb4007SShri Abhyankar idt = 5*i; 344578bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 344678bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 344778bb4007SShri Abhyankar for (m=0; m<nz; m++) { 344878bb4007SShri Abhyankar idx = 5*vi[m]; 344978bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 345078bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 345178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 345278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 345378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 345478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 345578bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 345678bb4007SShri Abhyankar v += 25; 345778bb4007SShri Abhyankar } 345878bb4007SShri Abhyankar idc = 5*c[i]; 345978bb4007SShri Abhyankar 
x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 346078bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 346178bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 346278bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 346378bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 346478bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 346578bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 346678bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 346778bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 346878bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 346978bb4007SShri Abhyankar } 347078bb4007SShri Abhyankar 347178bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 347278bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 34733649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 347478bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 347578bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 347678bb4007SShri Abhyankar PetscFunctionReturn(0); 347778bb4007SShri Abhyankar } 347878bb4007SShri Abhyankar 34798f690400SShri Abhyankar #undef __FUNCT__ 348006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 348106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 348215091d37SBarry Smith { 348315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3484b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3485b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3486dfbe8321SBarry Smith PetscErrorCode ierr; 3487d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3488d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3489d9fead3dSBarry Smith const PetscScalar *b; 349015091d37SBarry Smith 349115091d37SBarry Smith PetscFunctionBegin; 34923649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 
34931ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 349415091d37SBarry Smith /* forward solve the lower triangular */ 349515091d37SBarry Smith idx = 0; 349615091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 349715091d37SBarry Smith for (i=1; i<n; i++) { 349815091d37SBarry Smith v = aa + 25*ai[i]; 349915091d37SBarry Smith vi = aj + ai[i]; 350015091d37SBarry Smith nz = diag[i] - ai[i]; 350115091d37SBarry Smith idx = 5*i; 3502f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 350315091d37SBarry Smith while (nz--) { 350415091d37SBarry Smith jdx = 5*(*vi++); 350515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3506f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3507f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3508f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3509f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3510f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 351115091d37SBarry Smith v += 25; 351215091d37SBarry Smith } 3513f1af5d2fSBarry Smith x[idx] = s1; 3514f1af5d2fSBarry Smith x[1+idx] = s2; 3515f1af5d2fSBarry Smith x[2+idx] = s3; 3516f1af5d2fSBarry Smith x[3+idx] = s4; 3517f1af5d2fSBarry Smith x[4+idx] = s5; 351815091d37SBarry Smith } 351915091d37SBarry Smith /* backward solve the upper triangular */ 352015091d37SBarry Smith for (i=n-1; i>=0; i--) { 352115091d37SBarry Smith v = aa + 25*diag[i] + 25; 352215091d37SBarry Smith vi = aj + diag[i] + 1; 352315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 352415091d37SBarry Smith idt = 5*i; 3525f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3526f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 352715091d37SBarry Smith while (nz--) { 352815091d37SBarry Smith idx = 5*(*vi++); 352915091d37SBarry Smith x1 = 
x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3530f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3531f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3532f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3533f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3534f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 353515091d37SBarry Smith v += 25; 353615091d37SBarry Smith } 353715091d37SBarry Smith v = aa + 25*diag[i]; 3538f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3539f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3540f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3541f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3542f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 354315091d37SBarry Smith } 354415091d37SBarry Smith 35453649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 35461ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3547dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 354815091d37SBarry Smith PetscFunctionReturn(0); 354915091d37SBarry Smith } 355015091d37SBarry Smith 3551cee9d6f2SShri Abhyankar #undef __FUNCT__ 35524dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 35534dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 355453cca76cSShri Abhyankar { 355553cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3556b3260449SShri Abhyankar const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3557b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 355853cca76cSShri Abhyankar PetscErrorCode ierr; 355953cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 
356053cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 356153cca76cSShri Abhyankar const PetscScalar *b; 356253cca76cSShri Abhyankar 356353cca76cSShri Abhyankar PetscFunctionBegin; 35643649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 356553cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 356653cca76cSShri Abhyankar /* forward solve the lower triangular */ 356753cca76cSShri Abhyankar idx = 0; 356853cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 356953cca76cSShri Abhyankar for (i=1; i<n; i++) { 357053cca76cSShri Abhyankar v = aa + 25*ai[i]; 357153cca76cSShri Abhyankar vi = aj + ai[i]; 357253cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 357353cca76cSShri Abhyankar idx = 5*i; 357453cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 357553cca76cSShri Abhyankar for (k=0; k<nz; k++) { 357653cca76cSShri Abhyankar jdx = 5*vi[k]; 357753cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 357853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 357953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 358053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 358153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 358253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 358353cca76cSShri Abhyankar v += 25; 358453cca76cSShri Abhyankar } 358553cca76cSShri Abhyankar x[idx] = s1; 358653cca76cSShri Abhyankar x[1+idx] = s2; 358753cca76cSShri Abhyankar x[2+idx] = s3; 358853cca76cSShri Abhyankar x[3+idx] = s4; 358953cca76cSShri Abhyankar x[4+idx] = s5; 359053cca76cSShri Abhyankar } 359153cca76cSShri Abhyankar 359253cca76cSShri Abhyankar /* backward solve the upper triangular */ 359353cca76cSShri Abhyankar for (i=n-1; i>=0; i--) { 359453cca76cSShri Abhyankar v = aa 
+ 25*(adiag[i+1]+1); 359553cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 359653cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 359753cca76cSShri Abhyankar idt = 5*i; 359853cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 359953cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360053cca76cSShri Abhyankar for (k=0; k<nz; k++) { 360153cca76cSShri Abhyankar idx = 5*vi[k]; 360253cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 360353cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 360453cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 360553cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 360653cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 360753cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 360853cca76cSShri Abhyankar v += 25; 360953cca76cSShri Abhyankar } 361053cca76cSShri Abhyankar /* x = inv_diagonal*x */ 361153cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 361253cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 361353cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 361453cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 361553cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 361653cca76cSShri Abhyankar } 361753cca76cSShri Abhyankar 36183649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 361953cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 362053cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 362153cca76cSShri Abhyankar PetscFunctionReturn(0); 362253cca76cSShri Abhyankar } 362353cca76cSShri Abhyankar 362453cca76cSShri Abhyankar #undef __FUNCT__ 362506e38f1dSHong Zhang #define __FUNCT__ 
"MatSolve_SeqBAIJ_4_inplace" 362606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 36274e2b4712SSatish Balay { 36284e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 36294e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 36306849ba73SBarry Smith PetscErrorCode ierr; 3631b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3632b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 36335d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3634d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3635d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3636d9fead3dSBarry Smith const PetscScalar *b; 36374e2b4712SSatish Balay 36384e2b4712SSatish Balay PetscFunctionBegin; 36393649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 36401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3641f1af5d2fSBarry Smith t = a->solve_work; 36424e2b4712SSatish Balay 36434e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 36444e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 36454e2b4712SSatish Balay 36464e2b4712SSatish Balay /* forward solve the lower triangular */ 36474e2b4712SSatish Balay idx = 4*(*r++); 3648f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3649f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 36504e2b4712SSatish Balay for (i=1; i<n; i++) { 36514e2b4712SSatish Balay v = aa + 16*ai[i]; 36524e2b4712SSatish Balay vi = aj + ai[i]; 36534e2b4712SSatish Balay nz = diag[i] - ai[i]; 36544e2b4712SSatish Balay idx = 4*(*r++); 3655f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 36564e2b4712SSatish Balay while (nz--) { 36574e2b4712SSatish Balay idx = 4*(*vi++); 3658f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3659f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3660f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + 
v[13]*x4; 3661f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3662f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 36634e2b4712SSatish Balay v += 16; 36644e2b4712SSatish Balay } 36654e2b4712SSatish Balay idx = 4*i; 3666f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3667f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 36684e2b4712SSatish Balay } 36694e2b4712SSatish Balay /* backward solve the upper triangular */ 36704e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 36714e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 36724e2b4712SSatish Balay vi = aj + diag[i] + 1; 36734e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 36744e2b4712SSatish Balay idt = 4*i; 3675f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3676f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 36774e2b4712SSatish Balay while (nz--) { 36784e2b4712SSatish Balay idx = 4*(*vi++); 3679f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3680f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3681f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3682f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3683f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3684f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 36854e2b4712SSatish Balay v += 16; 36864e2b4712SSatish Balay } 36874e2b4712SSatish Balay idc = 4*(*c--); 36884e2b4712SSatish Balay v = aa + 16*diag[i]; 3689f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3690f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3691f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3692f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 36934e2b4712SSatish Balay } 36944e2b4712SSatish Balay 36954e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 36964e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 36973649974fSBarry Smith ierr = 
VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 36981ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3699dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37004e2b4712SSatish Balay PetscFunctionReturn(0); 37014e2b4712SSatish Balay } 3702f26ec98cSKris Buschelman 37038f690400SShri Abhyankar #undef __FUNCT__ 37044dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 37054dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 370678bb4007SShri Abhyankar { 370778bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 370878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 370978bb4007SShri Abhyankar PetscErrorCode ierr; 3710b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3711b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 371278bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 371378bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 371478bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 371578bb4007SShri Abhyankar const PetscScalar *b; 371678bb4007SShri Abhyankar 371778bb4007SShri Abhyankar PetscFunctionBegin; 37183649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 371978bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 372078bb4007SShri Abhyankar t = a->solve_work; 372178bb4007SShri Abhyankar 372278bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 372378bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 372478bb4007SShri Abhyankar 372578bb4007SShri Abhyankar /* forward solve the lower triangular */ 372678bb4007SShri Abhyankar idx = 4*r[0]; 372778bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 372878bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 372978bb4007SShri Abhyankar for (i=1; i<n; i++) { 373078bb4007SShri Abhyankar v = aa + 16*ai[i]; 373178bb4007SShri Abhyankar vi = aj + ai[i]; 373278bb4007SShri Abhyankar nz = 
ai[i+1] - ai[i]; 373378bb4007SShri Abhyankar idx = 4*r[i]; 373478bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 373578bb4007SShri Abhyankar for (m=0; m<nz; m++) { 373678bb4007SShri Abhyankar idx = 4*vi[m]; 373778bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 373878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 373978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 374078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 374178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 374278bb4007SShri Abhyankar v += 16; 374378bb4007SShri Abhyankar } 374478bb4007SShri Abhyankar idx = 4*i; 374578bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 374678bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 374778bb4007SShri Abhyankar } 374878bb4007SShri Abhyankar /* backward solve the upper triangular */ 374978bb4007SShri Abhyankar for (i=n-1; i>=0; i--) { 375078bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 375178bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 375278bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 375378bb4007SShri Abhyankar idt = 4*i; 375478bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 375578bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 375678bb4007SShri Abhyankar for (m=0; m<nz; m++) { 375778bb4007SShri Abhyankar idx = 4*vi[m]; 375878bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 375978bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 376078bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 376178bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 376278bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 376378bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 376478bb4007SShri Abhyankar v += 16; 376578bb4007SShri Abhyankar } 376678bb4007SShri Abhyankar idc = 4*c[i]; 376778bb4007SShri Abhyankar x[idc] = t[idt] = 
v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 376878bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 376978bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 377078bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 377178bb4007SShri Abhyankar } 377278bb4007SShri Abhyankar 377378bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 377478bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 37753649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 377678bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 377778bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 377878bb4007SShri Abhyankar PetscFunctionReturn(0); 377978bb4007SShri Abhyankar } 378078bb4007SShri Abhyankar 378178bb4007SShri Abhyankar #undef __FUNCT__ 3782f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3783dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3784f26ec98cSKris Buschelman { 3785f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 3786f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 37876849ba73SBarry Smith PetscErrorCode ierr; 3788b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3789b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 37905d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3791d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3792d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3793d9fead3dSBarry Smith PetscScalar *x; 3794d9fead3dSBarry Smith const PetscScalar *b; 3795f26ec98cSKris Buschelman 3796f26ec98cSKris Buschelman PetscFunctionBegin; 37973649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 37981ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3799f26ec98cSKris Buschelman t = (MatScalar*)a->solve_work; 3800f26ec98cSKris Buschelman 
3801f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3802f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3803f26ec98cSKris Buschelman 3804f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3805f26ec98cSKris Buschelman idx = 4*(*r++); 3806f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3807f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3808f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3809f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3810f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3811f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3812f26ec98cSKris Buschelman vi = aj + ai[i]; 3813f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3814f26ec98cSKris Buschelman idx = 4*(*r++); 3815f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3816f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3817f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3818f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3819f26ec98cSKris Buschelman while (nz--) { 3820f26ec98cSKris Buschelman idx = 4*(*vi++); 3821f26ec98cSKris Buschelman x1 = t[idx]; 3822f26ec98cSKris Buschelman x2 = t[1+idx]; 3823f26ec98cSKris Buschelman x3 = t[2+idx]; 3824f26ec98cSKris Buschelman x4 = t[3+idx]; 3825f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3826f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3827f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3828f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3829f26ec98cSKris Buschelman v += 16; 3830f26ec98cSKris Buschelman } 3831f26ec98cSKris Buschelman idx = 4*i; 3832f26ec98cSKris Buschelman t[idx] = s1; 3833f26ec98cSKris Buschelman t[1+idx] = s2; 3834f26ec98cSKris Buschelman t[2+idx] = s3; 3835f26ec98cSKris Buschelman t[3+idx] = s4; 3836f26ec98cSKris Buschelman } 3837f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3838f26ec98cSKris Buschelman for 
(i=n-1; i>=0; i--) { 3839f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3840f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3841f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3842f26ec98cSKris Buschelman idt = 4*i; 3843f26ec98cSKris Buschelman s1 = t[idt]; 3844f26ec98cSKris Buschelman s2 = t[1+idt]; 3845f26ec98cSKris Buschelman s3 = t[2+idt]; 3846f26ec98cSKris Buschelman s4 = t[3+idt]; 3847f26ec98cSKris Buschelman while (nz--) { 3848f26ec98cSKris Buschelman idx = 4*(*vi++); 3849f26ec98cSKris Buschelman x1 = t[idx]; 3850f26ec98cSKris Buschelman x2 = t[1+idx]; 3851f26ec98cSKris Buschelman x3 = t[2+idx]; 3852f26ec98cSKris Buschelman x4 = t[3+idx]; 3853f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3854f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3855f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3856f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3857f26ec98cSKris Buschelman v += 16; 3858f26ec98cSKris Buschelman } 3859f26ec98cSKris Buschelman idc = 4*(*c--); 3860f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3861f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3862f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3863f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3864f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3865f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3866f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3867f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3868f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3869f26ec98cSKris Buschelman } 3870f26ec98cSKris Buschelman 3871f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3872f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 38733649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 38741ebc52fbSHong Zhang ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 3875dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3876f26ec98cSKris Buschelman PetscFunctionReturn(0); 3877f26ec98cSKris Buschelman } 3878f26ec98cSKris Buschelman 387924c233c2SKris Buschelman #if defined(PETSC_HAVE_SSE) 388024c233c2SKris Buschelman 388124c233c2SKris Buschelman #include PETSC_HAVE_SSE 388224c233c2SKris Buschelman 388324c233c2SKris Buschelman #undef __FUNCT__ 388424c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3885dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 388624c233c2SKris Buschelman { 388724c233c2SKris Buschelman /* 388824c233c2SKris Buschelman Note: This code uses demotion of double 388924c233c2SKris Buschelman to float when performing the mixed-mode computation. 389024c233c2SKris Buschelman This may not be numerically reasonable for all applications. 389124c233c2SKris Buschelman */ 389224c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 389324c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 38946849ba73SBarry Smith PetscErrorCode ierr; 38955d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 38965d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 389724c233c2SKris Buschelman MatScalar *aa=a->a,*v; 389887828ca2SBarry Smith PetscScalar *x,*b,*t; 389924c233c2SKris Buschelman 390024c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 390124c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 390224c233c2SKris Buschelman unsigned long offset; 390324c233c2SKris Buschelman 390424c233c2SKris Buschelman PetscFunctionBegin; 390524c233c2SKris Buschelman SSE_SCOPE_BEGIN; 390624c233c2SKris Buschelman 390724c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 390824c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 390924c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 
391024c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 391124c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 391224c233c2SKris Buschelman 39131ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39141ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 391524c233c2SKris Buschelman t = a->solve_work; 391624c233c2SKris Buschelman 391724c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 391824c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 391924c233c2SKris Buschelman 392024c233c2SKris Buschelman /* forward solve the lower triangular */ 392124c233c2SKris Buschelman idx = 4*(*r++); 392224c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 392324c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 392424c233c2SKris Buschelman v = aa + 16*ai[1]; 392524c233c2SKris Buschelman 392624c233c2SKris Buschelman for (i=1; i<n; ) { 392724c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 392824c233c2SKris Buschelman vi = aj + ai[i]; 392924c233c2SKris Buschelman nz = diag[i] - ai[i]; 393024c233c2SKris Buschelman idx = 4*(*r++); 393124c233c2SKris Buschelman 393224c233c2SKris Buschelman /* Demote sum from double to float */ 393324c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 393424c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 393524c233c2SKris Buschelman 393624c233c2SKris Buschelman while (nz--) { 393724c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 393824c233c2SKris Buschelman idx = 4*(*vi++); 393924c233c2SKris Buschelman 394024c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 394124c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 394224c233c2SKris Buschelman 394324c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 394424c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 394524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 394624c233c2SKris Buschelman 394724c233c2SKris Buschelman /* First Column */ 
394824c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 394924c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 395024c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 395124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 395224c233c2SKris Buschelman 395324c233c2SKris Buschelman /* Second Column */ 395424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 395524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 395624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 395724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 395824c233c2SKris Buschelman 395924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 396024c233c2SKris Buschelman 396124c233c2SKris Buschelman /* Third Column */ 396224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 396324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 396424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 396524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 396624c233c2SKris Buschelman 396724c233c2SKris Buschelman /* Fourth Column */ 396824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 396924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 397024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 397124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 397224c233c2SKris Buschelman SSE_INLINE_END_2 397324c233c2SKris Buschelman 397424c233c2SKris Buschelman v += 16; 397524c233c2SKris Buschelman } 397624c233c2SKris Buschelman idx = 4*i; 397724c233c2SKris Buschelman v = aa + 16*ai[++i]; 397824c233c2SKris Buschelman PREFETCH_NTA(v); 397924c233c2SKris Buschelman STORE_PS(tmps,XMM7); 398024c233c2SKris Buschelman 398124c233c2SKris Buschelman /* Promote result from float to double */ 398224c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 398324c233c2SKris Buschelman } 398424c233c2SKris Buschelman /* backward solve the upper triangular */ 398524c233c2SKris Buschelman idt = 4*(n-1); 398624c233c2SKris Buschelman ai16 = 16*diag[n-1]; 398724c233c2SKris Buschelman v = aa + ai16 + 16; 398824c233c2SKris 
Buschelman for (i=n-1; i>=0; ) { 398924c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 399024c233c2SKris Buschelman vi = aj + diag[i] + 1; 399124c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 399224c233c2SKris Buschelman 399324c233c2SKris Buschelman /* Demote accumulator from double to float */ 399424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 399524c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 399624c233c2SKris Buschelman 399724c233c2SKris Buschelman while (nz--) { 399824c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 399924c233c2SKris Buschelman idx = 4*(*vi++); 400024c233c2SKris Buschelman 400124c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 400224c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 400324c233c2SKris Buschelman 400424c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 400524c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 400624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 400724c233c2SKris Buschelman 400824c233c2SKris Buschelman /* First Column */ 400924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 401024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 401124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 401224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 401324c233c2SKris Buschelman 401424c233c2SKris Buschelman /* Second Column */ 401524c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 401624c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 401724c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 401824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 401924c233c2SKris Buschelman 402024c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 402124c233c2SKris Buschelman 402224c233c2SKris Buschelman /* Third Column */ 402324c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 402424c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 402524c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 402624c233c2SKris Buschelman 
SSE_SUB_PS(XMM7,XMM2) 402724c233c2SKris Buschelman 402824c233c2SKris Buschelman /* Fourth Column */ 402924c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 403024c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 403124c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 403224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 403324c233c2SKris Buschelman SSE_INLINE_END_2 403424c233c2SKris Buschelman v += 16; 403524c233c2SKris Buschelman } 403624c233c2SKris Buschelman v = aa + ai16; 403724c233c2SKris Buschelman ai16 = 16*diag[--i]; 403824c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 403924c233c2SKris Buschelman /* 404024c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 404124c233c2SKris Buschelman which was inverted as part of the factorization 404224c233c2SKris Buschelman */ 404324c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 404424c233c2SKris Buschelman /* First Column */ 404524c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 404624c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 404724c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 404824c233c2SKris Buschelman 404924c233c2SKris Buschelman /* Second Column */ 405024c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 405124c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 405224c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 405324c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 405424c233c2SKris Buschelman 405524c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 405624c233c2SKris Buschelman 405724c233c2SKris Buschelman /* Third Column */ 405824c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 405924c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 406024c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 406124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 406224c233c2SKris Buschelman 406324c233c2SKris Buschelman /* Fourth Column */ 406424c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 406524c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 
406624c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 406724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 406824c233c2SKris Buschelman 406924c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 407024c233c2SKris Buschelman SSE_INLINE_END_3 407124c233c2SKris Buschelman 407224c233c2SKris Buschelman /* Promote solution from float to double */ 407324c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 407424c233c2SKris Buschelman 407524c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 407624c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 407724c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 407824c233c2SKris Buschelman idc = 4*(*c--); 407924c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc]) 408024c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 408124c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 408224c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 408324c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 408424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 408524c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 408624c233c2SKris Buschelman SSE_INLINE_END_2 408724c233c2SKris Buschelman v = aa + ai16 + 16; 408824c233c2SKris Buschelman idt -= 4; 408924c233c2SKris Buschelman } 409024c233c2SKris Buschelman 409124c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 409224c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40931ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 40941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4095dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 409624c233c2SKris Buschelman SSE_SCOPE_END; 409724c233c2SKris Buschelman PetscFunctionReturn(0); 409824c233c2SKris Buschelman } 409924c233c2SKris Buschelman 410024c233c2SKris 
Buschelman #endif 41010ef38995SBarry Smith 41020ef38995SBarry Smith 41034e2b4712SSatish Balay /* 41044e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 41054e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 41064e2b4712SSatish Balay */ 41074a2ae208SSatish Balay #undef __FUNCT__ 410806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 410906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 41104e2b4712SSatish Balay { 41114e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4112356650c2SBarry Smith PetscInt n =a->mbs; 4113356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 4114dfbe8321SBarry Smith PetscErrorCode ierr; 4115356650c2SBarry Smith const PetscInt *diag = a->diag; 4116d9fead3dSBarry Smith const MatScalar *aa =a->a; 4117d9fead3dSBarry Smith PetscScalar *x; 4118d9fead3dSBarry Smith const PetscScalar *b; 41194e2b4712SSatish Balay 41204e2b4712SSatish Balay PetscFunctionBegin; 41213649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 41221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41234e2b4712SSatish Balay 4124aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 41252853dc0eSBarry Smith { 412687828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41272853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 41282853dc0eSBarry Smith } 4129aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 41302853dc0eSBarry Smith { 413187828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41322853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 41332853dc0eSBarry Smith } 4134aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 41352853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4136e1293385SBarry Smith #else 413730d4dcafSBarry Smith { 
413887828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4139d9fead3dSBarry Smith const MatScalar *v; 4140356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 4141356650c2SBarry Smith const PetscInt *vi; 4142e1293385SBarry Smith 41434e2b4712SSatish Balay /* forward solve the lower triangular */ 41444e2b4712SSatish Balay idx = 0; 4145e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 41464e2b4712SSatish Balay for (i=1; i<n; i++) { 41474e2b4712SSatish Balay v = aa + 16*ai[i]; 41484e2b4712SSatish Balay vi = aj + ai[i]; 41494e2b4712SSatish Balay nz = diag[i] - ai[i]; 4150e1293385SBarry Smith idx += 4; 4151f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 41524e2b4712SSatish Balay while (nz--) { 41534e2b4712SSatish Balay jdx = 4*(*vi++); 41544e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4155f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4156f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4157f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4158f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 41594e2b4712SSatish Balay v += 16; 41604e2b4712SSatish Balay } 4161f1af5d2fSBarry Smith x[idx] = s1; 4162f1af5d2fSBarry Smith x[1+idx] = s2; 4163f1af5d2fSBarry Smith x[2+idx] = s3; 4164f1af5d2fSBarry Smith x[3+idx] = s4; 41654e2b4712SSatish Balay } 41664e2b4712SSatish Balay /* backward solve the upper triangular */ 41674e555682SBarry Smith idt = 4*(n-1); 41684e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 41694e555682SBarry Smith ai16 = 16*diag[i]; 41704e555682SBarry Smith v = aa + ai16 + 16; 41714e2b4712SSatish Balay vi = aj + diag[i] + 1; 41724e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4173f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4174f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 41754e2b4712SSatish Balay while (nz--) { 41764e2b4712SSatish Balay idx = 4*(*vi++); 41774e2b4712SSatish Balay x1 = x[idx]; x2 = 
x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4178f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4179f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4180f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4181f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 41824e2b4712SSatish Balay v += 16; 41834e2b4712SSatish Balay } 41844e555682SBarry Smith v = aa + ai16; 4185f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4186f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4187f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4188f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4189329f5518SBarry Smith idt -= 4; 41904e2b4712SSatish Balay } 419130d4dcafSBarry Smith } 4192e1293385SBarry Smith #endif 41934e2b4712SSatish Balay 41943649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 41951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 41974e2b4712SSatish Balay PetscFunctionReturn(0); 41984e2b4712SSatish Balay } 41994e2b4712SSatish Balay 4200b2b2dd24SShri Abhyankar #undef __FUNCT__ 42014dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 42024dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4203b2b2dd24SShri Abhyankar { 4204b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4205b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4206b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4207b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4208b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4209b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4210b2b2dd24SShri Abhyankar PetscScalar *x; 4211b2b2dd24SShri Abhyankar const PetscScalar *b; 4212b2b2dd24SShri Abhyankar 
PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4213cee9d6f2SShri Abhyankar 4214b2b2dd24SShri Abhyankar PetscFunctionBegin; 42153649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4216b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4217b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4218b2b2dd24SShri Abhyankar idx = 0; 4219b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4220b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4221b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4222b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4223b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4224b2b2dd24SShri Abhyankar idx = bs*i; 4225b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4226b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 4227b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4228b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4229b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4230b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4231b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4232b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4233b2b2dd24SShri Abhyankar 4234b2b2dd24SShri Abhyankar v += bs2; 4235b2b2dd24SShri Abhyankar } 4236b2b2dd24SShri Abhyankar 4237b2b2dd24SShri Abhyankar x[idx] = s1; 4238b2b2dd24SShri Abhyankar x[1+idx] = s2; 4239b2b2dd24SShri Abhyankar x[2+idx] = s3; 4240b2b2dd24SShri Abhyankar x[3+idx] = s4; 4241b2b2dd24SShri Abhyankar } 4242b2b2dd24SShri Abhyankar 4243b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4244b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--) { 4245b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4246b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4247b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4248b2b2dd24SShri Abhyankar idt = bs*i; 4249b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = 
x[3+idt]; 4250b2b2dd24SShri Abhyankar 4251b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 4252b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4253b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4254b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4255b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4256b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4257b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4258b2b2dd24SShri Abhyankar 4259b2b2dd24SShri Abhyankar v += bs2; 4260b2b2dd24SShri Abhyankar } 4261b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4262b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4263b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4264b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4265b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4266b2b2dd24SShri Abhyankar 4267b2b2dd24SShri Abhyankar } 4268b2b2dd24SShri Abhyankar 42693649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4270b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4271b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4272b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4273b2b2dd24SShri Abhyankar } 4274cee9d6f2SShri Abhyankar 4275cee9d6f2SShri Abhyankar #undef __FUNCT__ 4276f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4278f26ec98cSKris Buschelman { 4279f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4280b3260449SShri Abhyankar const PetscInt n =a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4281dfbe8321SBarry Smith PetscErrorCode ierr; 4282b3260449SShri Abhyankar const MatScalar *aa=a->a; 4283b3260449SShri Abhyankar const PetscScalar 
*b; 4284b3260449SShri Abhyankar PetscScalar *x; 4285f26ec98cSKris Buschelman 4286f26ec98cSKris Buschelman PetscFunctionBegin; 42873649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 42881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4289f26ec98cSKris Buschelman 4290f26ec98cSKris Buschelman { 4291f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4292b3260449SShri Abhyankar const MatScalar *v; 4293b3260449SShri Abhyankar MatScalar *t=(MatScalar*)x; 4294b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i,ai16; 4295b3260449SShri Abhyankar const PetscInt *vi; 4296f26ec98cSKris Buschelman 4297f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4298f26ec98cSKris Buschelman idx = 0; 4299f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4300f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4301f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4302f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4303f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4304f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4305f26ec98cSKris Buschelman vi = aj + ai[i]; 4306f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4307f26ec98cSKris Buschelman idx += 4; 4308f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4309f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4310f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4311f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4312f26ec98cSKris Buschelman while (nz--) { 4313f26ec98cSKris Buschelman jdx = 4*(*vi++); 4314f26ec98cSKris Buschelman x1 = t[jdx]; 4315f26ec98cSKris Buschelman x2 = t[1+jdx]; 4316f26ec98cSKris Buschelman x3 = t[2+jdx]; 4317f26ec98cSKris Buschelman x4 = t[3+jdx]; 4318f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4319f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4320f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4321f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4322f26ec98cSKris Buschelman v += 
16; 4323f26ec98cSKris Buschelman } 4324f26ec98cSKris Buschelman t[idx] = s1; 4325f26ec98cSKris Buschelman t[1+idx] = s2; 4326f26ec98cSKris Buschelman t[2+idx] = s3; 4327f26ec98cSKris Buschelman t[3+idx] = s4; 4328f26ec98cSKris Buschelman } 4329f26ec98cSKris Buschelman /* backward solve the upper triangular */ 4330f26ec98cSKris Buschelman idt = 4*(n-1); 4331f26ec98cSKris Buschelman for (i=n-1; i>=0; i--) { 4332f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4333f26ec98cSKris Buschelman v = aa + ai16 + 16; 4334f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4335f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4336f26ec98cSKris Buschelman s1 = t[idt]; 4337f26ec98cSKris Buschelman s2 = t[1+idt]; 4338f26ec98cSKris Buschelman s3 = t[2+idt]; 4339f26ec98cSKris Buschelman s4 = t[3+idt]; 4340f26ec98cSKris Buschelman while (nz--) { 4341f26ec98cSKris Buschelman idx = 4*(*vi++); 4342f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4343f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4344f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4345f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4346f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4347f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4348f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4349f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4350f26ec98cSKris Buschelman v += 16; 4351f26ec98cSKris Buschelman } 4352f26ec98cSKris Buschelman v = aa + ai16; 4353f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4354f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4355f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4356f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4357f26ec98cSKris Buschelman idt -= 4; 4358f26ec98cSKris Buschelman } 4359f26ec98cSKris Buschelman } 
4360f26ec98cSKris Buschelman 43613649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 43621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4363dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4364f26ec98cSKris Buschelman PetscFunctionReturn(0); 4365f26ec98cSKris Buschelman } 4366f26ec98cSKris Buschelman 43673660e330SKris Buschelman #if defined(PETSC_HAVE_SSE) 43683660e330SKris Buschelman 43693660e330SKris Buschelman #include PETSC_HAVE_SSE 43703660e330SKris Buschelman #undef __FUNCT__ 43717cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4372dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 43733660e330SKris Buschelman { 43743660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 43752aa5897fSKris Buschelman unsigned short *aj=(unsigned short*)a->j; 4376dfbe8321SBarry Smith PetscErrorCode ierr; 4377dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 43783660e330SKris Buschelman MatScalar *aa=a->a; 437987828ca2SBarry Smith PetscScalar *x,*b; 43803660e330SKris Buschelman 43813660e330SKris Buschelman PetscFunctionBegin; 43823660e330SKris Buschelman SSE_SCOPE_BEGIN; 43833660e330SKris Buschelman /* 43843660e330SKris Buschelman Note: This code currently uses demotion of double 43853660e330SKris Buschelman to float when performing the mixed-mode computation. 43863660e330SKris Buschelman This may not be numerically reasonable for all applications. 
43873660e330SKris Buschelman */ 43883660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 43893660e330SKris Buschelman 43901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 43911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 43923660e330SKris Buschelman { 4393eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4394eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar*)x; 43952aa5897fSKris Buschelman int nz,i,idt,ai16; 43962aa5897fSKris Buschelman unsigned int jdx,idx; 43972aa5897fSKris Buschelman unsigned short *vi; 4398eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 43993660e330SKris Buschelman 4400eb05f457SKris Buschelman /* First block is the identity. */ 44013660e330SKris Buschelman idx = 0; 4402eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 44032aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 44043660e330SKris Buschelman 44053660e330SKris Buschelman for (i=1; i<n; ) { 44063660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44073660e330SKris Buschelman vi = aj + ai[i]; 44083660e330SKris Buschelman nz = diag[i] - ai[i]; 44093660e330SKris Buschelman idx += 4; 44103660e330SKris Buschelman 4411eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 4412eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4413eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 44143660e330SKris Buschelman 44153660e330SKris Buschelman while (nz--) { 44163660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44172aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 44183660e330SKris Buschelman 44193660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4420eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 44213660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44223660e330SKris Buschelman 44233660e330SKris Buschelman /* First Column */ 44243660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44253660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44263660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44273660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44283660e330SKris Buschelman 44293660e330SKris Buschelman /* Second Column */ 44303660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44313660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44323660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44333660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44343660e330SKris Buschelman 44353660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44363660e330SKris Buschelman 44373660e330SKris Buschelman /* Third Column */ 44383660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 44393660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44403660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44413660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44423660e330SKris Buschelman 44433660e330SKris Buschelman /* Fourth Column */ 44443660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44453660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44463660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44473660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44483660e330SKris Buschelman SSE_INLINE_END_2 44493660e330SKris Buschelman 44503660e330SKris Buschelman v += 16; 44513660e330SKris 
Buschelman } 44523660e330SKris Buschelman v = aa + 16*ai[++i]; 44533660e330SKris Buschelman PREFETCH_NTA(v); 4454eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 44553660e330SKris Buschelman } 4456eb05f457SKris Buschelman 4457eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4458eb05f457SKris Buschelman 44593660e330SKris Buschelman idt = 4*(n-1); 44603660e330SKris Buschelman ai16 = 16*diag[n-1]; 44613660e330SKris Buschelman v = aa + ai16 + 16; 44623660e330SKris Buschelman for (i=n-1; i>=0; ) { 44633660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44643660e330SKris Buschelman vi = aj + diag[i] + 1; 44653660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 44663660e330SKris Buschelman 4467eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 44683660e330SKris Buschelman 44693660e330SKris Buschelman while (nz--) { 44703660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44712aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 44723660e330SKris Buschelman 44733660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4474eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 44753660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44763660e330SKris Buschelman 44773660e330SKris Buschelman /* First Column */ 44783660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44793660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44803660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44813660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44823660e330SKris Buschelman 44833660e330SKris Buschelman /* Second Column */ 44843660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44853660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44863660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44873660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44883660e330SKris Buschelman 44893660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44903660e330SKris Buschelman 44913660e330SKris Buschelman /* Third Column */ 44923660e330SKris 
Buschelman SSE_COPY_PS(XMM2,XMM6) 44933660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44943660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44953660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44963660e330SKris Buschelman 44973660e330SKris Buschelman /* Fourth Column */ 44983660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44993660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45003660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45013660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45023660e330SKris Buschelman SSE_INLINE_END_2 45033660e330SKris Buschelman v += 16; 45043660e330SKris Buschelman } 45053660e330SKris Buschelman v = aa + ai16; 45063660e330SKris Buschelman ai16 = 16*diag[--i]; 45073660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 45083660e330SKris Buschelman /* 45093660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 45103660e330SKris Buschelman which was inverted as part of the factorization 45113660e330SKris Buschelman */ 4512eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 45133660e330SKris Buschelman /* First Column */ 45143660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 45153660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45163660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 45173660e330SKris Buschelman 45183660e330SKris Buschelman /* Second Column */ 45193660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 45203660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45213660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 45223660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 45233660e330SKris Buschelman 45243660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 45253660e330SKris Buschelman 45263660e330SKris Buschelman /* Third Column */ 45273660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 45283660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45293660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 45303660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 
45313660e330SKris Buschelman 45323660e330SKris Buschelman /* Fourth Column */ 45333660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 45343660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45353660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 45363660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 45373660e330SKris Buschelman 45383660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 45393660e330SKris Buschelman SSE_INLINE_END_3 45403660e330SKris Buschelman 45413660e330SKris Buschelman v = aa + ai16 + 16; 45423660e330SKris Buschelman idt -= 4; 45433660e330SKris Buschelman } 4544eb05f457SKris Buschelman 4545eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4546eb05f457SKris Buschelman idt = 4*(n-1); 4547eb05f457SKris Buschelman for (i=n-1; i>=0; i--) { 4548eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4549eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4550eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4551eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4552eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4553eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4554eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4555eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 455654693613SKris Buschelman idt -= 4; 45573660e330SKris Buschelman } 4558eb05f457SKris Buschelman 4559eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 45601ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45611ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4562dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 45633660e330SKris Buschelman SSE_SCOPE_END; 45643660e330SKris Buschelman PetscFunctionReturn(0); 45653660e330SKris Buschelman } 45663660e330SKris Buschelman 45677cf1b8d3SKris Buschelman #undef __FUNCT__ 45687cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4569dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 45707cf1b8d3SKris Buschelman { 45717cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 45727cf1b8d3SKris Buschelman int *aj=a->j; 4573dfbe8321SBarry Smith PetscErrorCode ierr; 4574dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 45757cf1b8d3SKris Buschelman MatScalar *aa=a->a; 45767cf1b8d3SKris Buschelman PetscScalar *x,*b; 45777cf1b8d3SKris Buschelman 45787cf1b8d3SKris Buschelman PetscFunctionBegin; 45797cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 45807cf1b8d3SKris Buschelman /* 45817cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 45827cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 45837cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
45847cf1b8d3SKris Buschelman */ 45857cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 45867cf1b8d3SKris Buschelman 45871ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 45881ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 45897cf1b8d3SKris Buschelman { 45907cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 45917cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar*)x; 45927cf1b8d3SKris Buschelman int nz,i,idt,ai16; 45937cf1b8d3SKris Buschelman int jdx,idx; 45947cf1b8d3SKris Buschelman int *vi; 45957cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 45967cf1b8d3SKris Buschelman 45977cf1b8d3SKris Buschelman /* First block is the identity. */ 45987cf1b8d3SKris Buschelman idx = 0; 45997cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 46007cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 46017cf1b8d3SKris Buschelman 46027cf1b8d3SKris Buschelman for (i=1; i<n; ) { 46037cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 46047cf1b8d3SKris Buschelman vi = aj + ai[i]; 46057cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 46067cf1b8d3SKris Buschelman idx += 4; 46077cf1b8d3SKris Buschelman 46087cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 46097cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 46107cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 46117cf1b8d3SKris Buschelman 46127cf1b8d3SKris Buschelman while (nz--) { 46137cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 46147cf1b8d3SKris Buschelman jdx = 4*(*vi++); 46157cf1b8d3SKris Buschelman /* jdx = *vi++; */ 46167cf1b8d3SKris Buschelman 46177cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 46187cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 46197cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 46207cf1b8d3SKris Buschelman 46217cf1b8d3SKris Buschelman /* First Column */ 46227cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 46237cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46247cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 46257cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 46267cf1b8d3SKris Buschelman 46277cf1b8d3SKris Buschelman /* Second Column */ 46287cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 46297cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46307cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 46317cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 46327cf1b8d3SKris Buschelman 46337cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 46347cf1b8d3SKris Buschelman 46357cf1b8d3SKris Buschelman /* Third Column */ 46367cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 46377cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46387cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 46397cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 46407cf1b8d3SKris Buschelman 46417cf1b8d3SKris Buschelman /* Fourth Column */ 46427cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 46437cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46447cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 46457cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 46467cf1b8d3SKris Buschelman SSE_INLINE_END_2 46477cf1b8d3SKris Buschelman 46487cf1b8d3SKris 
Buschelman v += 16; 46497cf1b8d3SKris Buschelman } 46507cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 46517cf1b8d3SKris Buschelman PREFETCH_NTA(v); 46527cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 46537cf1b8d3SKris Buschelman } 46547cf1b8d3SKris Buschelman 46557cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 46567cf1b8d3SKris Buschelman 46577cf1b8d3SKris Buschelman idt = 4*(n-1); 46587cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 46597cf1b8d3SKris Buschelman v = aa + ai16 + 16; 46607cf1b8d3SKris Buschelman for (i=n-1; i>=0; ) { 46617cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 46627cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 46637cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 46647cf1b8d3SKris Buschelman 46657cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 46667cf1b8d3SKris Buschelman 46677cf1b8d3SKris Buschelman while (nz--) { 46687cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 46697cf1b8d3SKris Buschelman idx = 4*(*vi++); 46707cf1b8d3SKris Buschelman /* idx = *vi++; */ 46717cf1b8d3SKris Buschelman 46727cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 46737cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 46747cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 46757cf1b8d3SKris Buschelman 46767cf1b8d3SKris Buschelman /* First Column */ 46777cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 46787cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46797cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 46807cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 46817cf1b8d3SKris Buschelman 46827cf1b8d3SKris Buschelman /* Second Column */ 46837cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 46847cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46857cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 46867cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 46877cf1b8d3SKris Buschelman 46887cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 46897cf1b8d3SKris Buschelman 
46907cf1b8d3SKris Buschelman /* Third Column */ 46917cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 46927cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46937cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 46947cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 46957cf1b8d3SKris Buschelman 46967cf1b8d3SKris Buschelman /* Fourth Column */ 46977cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 46987cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46997cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 47007cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 47017cf1b8d3SKris Buschelman SSE_INLINE_END_2 47027cf1b8d3SKris Buschelman v += 16; 47037cf1b8d3SKris Buschelman } 47047cf1b8d3SKris Buschelman v = aa + ai16; 47057cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 47067cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 47077cf1b8d3SKris Buschelman /* 47087cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 47097cf1b8d3SKris Buschelman which was inverted as part of the factorization 47107cf1b8d3SKris Buschelman */ 47117cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 47127cf1b8d3SKris Buschelman /* First Column */ 47137cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 47147cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 47157cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 47167cf1b8d3SKris Buschelman 47177cf1b8d3SKris Buschelman /* Second Column */ 47187cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 47197cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 47207cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 47217cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 47227cf1b8d3SKris Buschelman 47237cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 47247cf1b8d3SKris Buschelman 47257cf1b8d3SKris Buschelman /* Third Column */ 47267cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 47277cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 47287cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 47297cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 47307cf1b8d3SKris Buschelman 47317cf1b8d3SKris Buschelman /* Fourth Column */ 47327cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 47337cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 47347cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 47357cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 47367cf1b8d3SKris Buschelman 47377cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 47387cf1b8d3SKris Buschelman SSE_INLINE_END_3 47397cf1b8d3SKris Buschelman 47407cf1b8d3SKris Buschelman v = aa + ai16 + 16; 47417cf1b8d3SKris Buschelman idt -= 4; 47427cf1b8d3SKris Buschelman } 47437cf1b8d3SKris Buschelman 47447cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 47457cf1b8d3SKris Buschelman idt = 4*(n-1); 47467cf1b8d3SKris Buschelman for (i=n-1; i>=0; i--) { 47477cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 47487cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 47497cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 47507cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 47517cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 47527cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 47537cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 47547cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 47557cf1b8d3SKris Buschelman idt -= 4; 47567cf1b8d3SKris Buschelman } 47577cf1b8d3SKris Buschelman 47587cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 47591ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 47601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4761dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 47627cf1b8d3SKris Buschelman SSE_SCOPE_END; 47637cf1b8d3SKris Buschelman PetscFunctionReturn(0); 47647cf1b8d3SKris Buschelman } 47657cf1b8d3SKris Buschelman 47663660e330SKris Buschelman #endif 47678f690400SShri Abhyankar 47684a2ae208SSatish Balay #undef __FUNCT__ 476906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 477006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 47714e2b4712SSatish Balay { 47724e2b4712SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 47734e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 47746849ba73SBarry Smith PetscErrorCode ierr; 4775b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4776b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 47775d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4778d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4779d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4780d9fead3dSBarry Smith const PetscScalar *b; 47814e2b4712SSatish Balay 47824e2b4712SSatish Balay PetscFunctionBegin; 47833649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 47841ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4785f1af5d2fSBarry Smith t = a->solve_work; 47864e2b4712SSatish Balay 47874e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 47884e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 47894e2b4712SSatish Balay 47904e2b4712SSatish Balay /* forward solve the lower triangular */ 47914e2b4712SSatish Balay idx = 3*(*r++); 4792f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 47934e2b4712SSatish Balay for (i=1; i<n; i++) { 47944e2b4712SSatish Balay v = aa + 9*ai[i]; 47954e2b4712SSatish 
Balay vi = aj + ai[i]; 47964e2b4712SSatish Balay nz = diag[i] - ai[i]; 47974e2b4712SSatish Balay idx = 3*(*r++); 4798f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 47994e2b4712SSatish Balay while (nz--) { 48004e2b4712SSatish Balay idx = 3*(*vi++); 4801f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4802f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4803f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4804f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48054e2b4712SSatish Balay v += 9; 48064e2b4712SSatish Balay } 48074e2b4712SSatish Balay idx = 3*i; 4808f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 48094e2b4712SSatish Balay } 48104e2b4712SSatish Balay /* backward solve the upper triangular */ 48114e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 48124e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 48134e2b4712SSatish Balay vi = aj + diag[i] + 1; 48144e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 48154e2b4712SSatish Balay idt = 3*i; 4816f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 48174e2b4712SSatish Balay while (nz--) { 48184e2b4712SSatish Balay idx = 3*(*vi++); 4819f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4820f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4821f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4822f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48234e2b4712SSatish Balay v += 9; 48244e2b4712SSatish Balay } 48254e2b4712SSatish Balay idc = 3*(*c--); 48264e2b4712SSatish Balay v = aa + 9*diag[i]; 4827f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4828f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4829f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 48304e2b4712SSatish Balay } 48314e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48324e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 48333649974fSBarry Smith 
ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 48341ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4835dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 48364e2b4712SSatish Balay PetscFunctionReturn(0); 48374e2b4712SSatish Balay } 48384e2b4712SSatish Balay 48390c4413a7SShri Abhyankar #undef __FUNCT__ 48404dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3" 48414dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 48420c4413a7SShri Abhyankar { 48430c4413a7SShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 48440c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 48450c4413a7SShri Abhyankar PetscErrorCode ierr; 4846b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4847b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 48480c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 48490c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 48500c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 48510c4413a7SShri Abhyankar const PetscScalar *b; 48520c4413a7SShri Abhyankar 48530c4413a7SShri Abhyankar PetscFunctionBegin; 48543649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 48550c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 48560c4413a7SShri Abhyankar t = a->solve_work; 48570c4413a7SShri Abhyankar 48580c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48590c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 48600c4413a7SShri Abhyankar 48610c4413a7SShri Abhyankar /* forward solve the lower triangular */ 48620c4413a7SShri Abhyankar idx = 3*r[0]; 48630c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 48640c4413a7SShri Abhyankar for (i=1; i<n; i++) { 48650c4413a7SShri Abhyankar v = aa + 9*ai[i]; 48660c4413a7SShri Abhyankar vi = aj + ai[i]; 48670c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 48680c4413a7SShri Abhyankar idx = 
3*r[i]; 48690c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 48700c4413a7SShri Abhyankar for (m=0; m<nz; m++) { 48710c4413a7SShri Abhyankar idx = 3*vi[m]; 48720c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 48730c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 48740c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 48750c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48760c4413a7SShri Abhyankar v += 9; 48770c4413a7SShri Abhyankar } 48780c4413a7SShri Abhyankar idx = 3*i; 48790c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 48800c4413a7SShri Abhyankar } 48810c4413a7SShri Abhyankar /* backward solve the upper triangular */ 48820c4413a7SShri Abhyankar for (i=n-1; i>=0; i--) { 48830c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 48840c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 48850c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 48860c4413a7SShri Abhyankar idt = 3*i; 48870c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 48880c4413a7SShri Abhyankar for (m=0; m<nz; m++) { 48890c4413a7SShri Abhyankar idx = 3*vi[m]; 48900c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 48910c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 48920c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 48930c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48940c4413a7SShri Abhyankar v += 9; 48950c4413a7SShri Abhyankar } 48960c4413a7SShri Abhyankar idc = 3*c[i]; 48970c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 48980c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 48990c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 49000c4413a7SShri Abhyankar } 49010c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49020c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 49033649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 
49040c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 49050c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 49060c4413a7SShri Abhyankar PetscFunctionReturn(0); 49070c4413a7SShri Abhyankar } 49080c4413a7SShri Abhyankar 490915091d37SBarry Smith /* 491015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 491115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 491215091d37SBarry Smith */ 49134a2ae208SSatish Balay #undef __FUNCT__ 491406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 491506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 491615091d37SBarry Smith { 491715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 49180b68f018SBarry Smith const PetscInt n =a->mbs,*ai=a->i,*aj=a->j; 4919dfbe8321SBarry Smith PetscErrorCode ierr; 49200b68f018SBarry Smith const PetscInt *diag = a->diag,*vi; 4921d9fead3dSBarry Smith const MatScalar *aa =a->a,*v; 4922d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4923d9fead3dSBarry Smith const PetscScalar *b; 49240b68f018SBarry Smith PetscInt jdx,idt,idx,nz,i; 492515091d37SBarry Smith 492615091d37SBarry Smith PetscFunctionBegin; 49273649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 49281ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 492915091d37SBarry Smith 493015091d37SBarry Smith /* forward solve the lower triangular */ 493115091d37SBarry Smith idx = 0; 493215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 493315091d37SBarry Smith for (i=1; i<n; i++) { 493415091d37SBarry Smith v = aa + 9*ai[i]; 493515091d37SBarry Smith vi = aj + ai[i]; 493615091d37SBarry Smith nz = diag[i] - ai[i]; 493715091d37SBarry Smith idx += 3; 4938f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 493915091d37SBarry Smith while (nz--) { 494015091d37SBarry Smith jdx = 3*(*vi++); 
494115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4942f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4943f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4944f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 494515091d37SBarry Smith v += 9; 494615091d37SBarry Smith } 4947f1af5d2fSBarry Smith x[idx] = s1; 4948f1af5d2fSBarry Smith x[1+idx] = s2; 4949f1af5d2fSBarry Smith x[2+idx] = s3; 495015091d37SBarry Smith } 495115091d37SBarry Smith /* backward solve the upper triangular */ 495215091d37SBarry Smith for (i=n-1; i>=0; i--) { 495315091d37SBarry Smith v = aa + 9*diag[i] + 9; 495415091d37SBarry Smith vi = aj + diag[i] + 1; 495515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 495615091d37SBarry Smith idt = 3*i; 4957f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4958f1af5d2fSBarry Smith s3 = x[2+idt]; 495915091d37SBarry Smith while (nz--) { 496015091d37SBarry Smith idx = 3*(*vi++); 496115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4962f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4963f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4964f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 496515091d37SBarry Smith v += 9; 496615091d37SBarry Smith } 496715091d37SBarry Smith v = aa + 9*diag[i]; 4968f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4969f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4970f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 497115091d37SBarry Smith } 497215091d37SBarry Smith 49733649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 49741ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4975dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 497615091d37SBarry Smith PetscFunctionReturn(0); 497715091d37SBarry Smith } 497815091d37SBarry Smith 4979cee9d6f2SShri Abhyankar #undef __FUNCT__ 49804dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 
49814dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4982b2b2dd24SShri Abhyankar { 4983b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 4984b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4985b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4986b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4987b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4988b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4989b2b2dd24SShri Abhyankar PetscScalar *x; 4990b2b2dd24SShri Abhyankar const PetscScalar *b; 4991b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4992b2b2dd24SShri Abhyankar 4993b2b2dd24SShri Abhyankar PetscFunctionBegin; 49943649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4995b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4996b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4997b2b2dd24SShri Abhyankar idx = 0; 4998b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4999b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5000b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 5001b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5002b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5003b2b2dd24SShri Abhyankar idx = bs*i; 5004b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5005b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 5006b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 5007b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5008b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5009b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5010b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5011b2b2dd24SShri Abhyankar 5012b2b2dd24SShri Abhyankar v += bs2; 5013b2b2dd24SShri Abhyankar } 5014b2b2dd24SShri Abhyankar 5015b2b2dd24SShri Abhyankar x[idx] = s1; 5016b2b2dd24SShri Abhyankar x[1+idx] = s2; 5017b2b2dd24SShri Abhyankar x[2+idx] = s3; 5018b2b2dd24SShri 
Abhyankar } 5019b2b2dd24SShri Abhyankar 5020b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5021b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--) { 5022b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 5023b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5024b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5025b2b2dd24SShri Abhyankar idt = bs*i; 5026b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5027b2b2dd24SShri Abhyankar 5028b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 5029b2b2dd24SShri Abhyankar idx = bs*vi[k]; 5030b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5031b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5032b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5033b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5034b2b2dd24SShri Abhyankar 5035b2b2dd24SShri Abhyankar v += bs2; 5036b2b2dd24SShri Abhyankar } 5037b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5038b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5039b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5040b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5041b2b2dd24SShri Abhyankar 5042b2b2dd24SShri Abhyankar } 5043b2b2dd24SShri Abhyankar 50443649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5045b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5046b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5047b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5048b2b2dd24SShri Abhyankar } 5049b2b2dd24SShri Abhyankar 5050b2b2dd24SShri Abhyankar #undef __FUNCT__ 505106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 505206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 50534e2b4712SSatish Balay { 50544e2b4712SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 50554e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 50566849ba73SBarry Smith 
PetscErrorCode ierr; 5057b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5058b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 50595d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5060d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5061d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 5062d9fead3dSBarry Smith const PetscScalar *b; 50634e2b4712SSatish Balay 50644e2b4712SSatish Balay PetscFunctionBegin; 50653649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 50661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5067f1af5d2fSBarry Smith t = a->solve_work; 50684e2b4712SSatish Balay 50694e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 50704e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 50714e2b4712SSatish Balay 50724e2b4712SSatish Balay /* forward solve the lower triangular */ 50734e2b4712SSatish Balay idx = 2*(*r++); 5074f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 50754e2b4712SSatish Balay for (i=1; i<n; i++) { 50764e2b4712SSatish Balay v = aa + 4*ai[i]; 50774e2b4712SSatish Balay vi = aj + ai[i]; 50784e2b4712SSatish Balay nz = diag[i] - ai[i]; 50794e2b4712SSatish Balay idx = 2*(*r++); 5080f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 50814e2b4712SSatish Balay while (nz--) { 50824e2b4712SSatish Balay idx = 2*(*vi++); 5083f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5084f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5085f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 50864e2b4712SSatish Balay v += 4; 50874e2b4712SSatish Balay } 50884e2b4712SSatish Balay idx = 2*i; 5089f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 50904e2b4712SSatish Balay } 50914e2b4712SSatish Balay /* backward solve the upper triangular */ 50924e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 50934e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 50944e2b4712SSatish Balay vi = aj + diag[i] + 1; 50954e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 
50964e2b4712SSatish Balay idt = 2*i; 5097f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 50984e2b4712SSatish Balay while (nz--) { 50994e2b4712SSatish Balay idx = 2*(*vi++); 5100f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5101f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5102f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51034e2b4712SSatish Balay v += 4; 51044e2b4712SSatish Balay } 51054e2b4712SSatish Balay idc = 2*(*c--); 51064e2b4712SSatish Balay v = aa + 4*diag[i]; 5107f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5108f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51094e2b4712SSatish Balay } 51104e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51114e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51123649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5114dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51154e2b4712SSatish Balay PetscFunctionReturn(0); 51164e2b4712SSatish Balay } 51174e2b4712SSatish Balay 51180c4413a7SShri Abhyankar #undef __FUNCT__ 51194dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 51204dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 51210c4413a7SShri Abhyankar { 51220c4413a7SShri Abhyankar Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 51230c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 51240c4413a7SShri Abhyankar PetscErrorCode ierr; 5125b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5126b3260449SShri Abhyankar PetscInt i,nz,idx,jdx,idt,idc,m; 51270c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 51280c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 51290c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 51300c4413a7SShri Abhyankar const PetscScalar *b; 51310c4413a7SShri Abhyankar 51320c4413a7SShri Abhyankar PetscFunctionBegin; 
51333649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51340c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 51350c4413a7SShri Abhyankar t = a->solve_work; 51360c4413a7SShri Abhyankar 51370c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51380c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 51390c4413a7SShri Abhyankar 51400c4413a7SShri Abhyankar /* forward solve the lower triangular */ 51410c4413a7SShri Abhyankar idx = 2*r[0]; 51420c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 51430c4413a7SShri Abhyankar for (i=1; i<n; i++) { 51440c4413a7SShri Abhyankar v = aa + 4*ai[i]; 51450c4413a7SShri Abhyankar vi = aj + ai[i]; 51460c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 51470c4413a7SShri Abhyankar idx = 2*r[i]; 51480c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 51490c4413a7SShri Abhyankar for (m=0; m<nz; m++) { 51500c4413a7SShri Abhyankar jdx = 2*vi[m]; 51510c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 51520c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51530c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51540c4413a7SShri Abhyankar v += 4; 51550c4413a7SShri Abhyankar } 51560c4413a7SShri Abhyankar idx = 2*i; 51570c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 51580c4413a7SShri Abhyankar } 51590c4413a7SShri Abhyankar /* backward solve the upper triangular */ 51600c4413a7SShri Abhyankar for (i=n-1; i>=0; i--) { 51610c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 51620c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 51630c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 51640c4413a7SShri Abhyankar idt = 2*i; 51650c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 51660c4413a7SShri Abhyankar for (m=0; m<nz; m++) { 51670c4413a7SShri Abhyankar idx = 2*vi[m]; 51680c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 51690c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51700c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51710c4413a7SShri Abhyankar v += 4; 
51720c4413a7SShri Abhyankar } 51730c4413a7SShri Abhyankar idc = 2*c[i]; 51740c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 51750c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51760c4413a7SShri Abhyankar } 51770c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51780c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51793649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51800c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51810c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51820c4413a7SShri Abhyankar PetscFunctionReturn(0); 51830c4413a7SShri Abhyankar } 51848f690400SShri Abhyankar 518515091d37SBarry Smith /* 518615091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 518715091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 518815091d37SBarry Smith */ 51894a2ae208SSatish Balay #undef __FUNCT__ 519006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 519106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 519215091d37SBarry Smith { 519315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5194b3260449SShri Abhyankar const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5195dfbe8321SBarry Smith PetscErrorCode ierr; 5196d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5197d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5198d9fead3dSBarry Smith const PetscScalar *b; 5199b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 520015091d37SBarry Smith 520115091d37SBarry Smith PetscFunctionBegin; 52023649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 52031ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 520415091d37SBarry Smith 520515091d37SBarry Smith /* forward solve the lower triangular */ 520615091d37SBarry Smith idx = 0; 
520715091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 520815091d37SBarry Smith for (i=1; i<n; i++) { 520915091d37SBarry Smith v = aa + 4*ai[i]; 521015091d37SBarry Smith vi = aj + ai[i]; 521115091d37SBarry Smith nz = diag[i] - ai[i]; 521215091d37SBarry Smith idx += 2; 5213f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 521415091d37SBarry Smith while (nz--) { 521515091d37SBarry Smith jdx = 2*(*vi++); 521615091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5217f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5218f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 521915091d37SBarry Smith v += 4; 522015091d37SBarry Smith } 5221f1af5d2fSBarry Smith x[idx] = s1; 5222f1af5d2fSBarry Smith x[1+idx] = s2; 522315091d37SBarry Smith } 522415091d37SBarry Smith /* backward solve the upper triangular */ 522515091d37SBarry Smith for (i=n-1; i>=0; i--) { 522615091d37SBarry Smith v = aa + 4*diag[i] + 4; 522715091d37SBarry Smith vi = aj + diag[i] + 1; 522815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 522915091d37SBarry Smith idt = 2*i; 5230f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 523115091d37SBarry Smith while (nz--) { 523215091d37SBarry Smith idx = 2*(*vi++); 523315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5234f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5235f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 523615091d37SBarry Smith v += 4; 523715091d37SBarry Smith } 523815091d37SBarry Smith v = aa + 4*diag[i]; 5239f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5240f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 524115091d37SBarry Smith } 524215091d37SBarry Smith 52433649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5245dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 524615091d37SBarry Smith PetscFunctionReturn(0); 524715091d37SBarry Smith } 524815091d37SBarry Smith 5249cee9d6f2SShri Abhyankar #undef __FUNCT__ 52504dd39f65SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 52514dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5252b2b2dd24SShri Abhyankar { 5253b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5254b3260449SShri Abhyankar const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5255b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 5256b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5257b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5258b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5259b2b2dd24SShri Abhyankar const PetscScalar *b; 5260b2b2dd24SShri Abhyankar 5261b2b2dd24SShri Abhyankar PetscFunctionBegin; 52623649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5263b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5264b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5265b2b2dd24SShri Abhyankar idx = 0; 5266b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5267b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5268b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5269b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5270b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5271b2b2dd24SShri Abhyankar idx = 2*i; 5272b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 52734c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 52744c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5275b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 5276b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5277b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5278b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5279b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5280b2b2dd24SShri Abhyankar v += 4; 5281b2b2dd24SShri Abhyankar } 5282b2b2dd24SShri Abhyankar x[idx] = s1; 5283b2b2dd24SShri Abhyankar x[1+idx] = s2; 5284b2b2dd24SShri Abhyankar } 5285b2b2dd24SShri Abhyankar 5286b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5287b2b2dd24SShri Abhyankar for 
(i=n-1; i>=0; i--) { 5288b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5289b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5290b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5291b2b2dd24SShri Abhyankar idt = 2*i; 5292b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 52934c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 52944c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5295b2b2dd24SShri Abhyankar for (k=0; k<nz; k++) { 5296b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5297b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5298b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5299b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5300b2b2dd24SShri Abhyankar v += 4; 5301b2b2dd24SShri Abhyankar } 5302b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5303b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5304b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5305b2b2dd24SShri Abhyankar } 5306b2b2dd24SShri Abhyankar 53073649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5308b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5309b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5310b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5311b2b2dd24SShri Abhyankar } 5312b2b2dd24SShri Abhyankar 5313b2b2dd24SShri Abhyankar #undef __FUNCT__ 531406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 531506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 53164e2b4712SSatish Balay { 53174e2b4712SSatish Balay Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data; 53184e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 53196849ba73SBarry Smith PetscErrorCode ierr; 5320b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5321b3260449SShri Abhyankar PetscInt i,nz; 53225d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5323b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 
5324b3260449SShri Abhyankar PetscScalar *x,s1,*t; 5325b3260449SShri Abhyankar const PetscScalar *b; 53264e2b4712SSatish Balay 53274e2b4712SSatish Balay PetscFunctionBegin; 53284e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 53294e2b4712SSatish Balay 53303649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 53311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5332f1af5d2fSBarry Smith t = a->solve_work; 53334e2b4712SSatish Balay 53344e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 53354e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 53364e2b4712SSatish Balay 53374e2b4712SSatish Balay /* forward solve the lower triangular */ 5338f1af5d2fSBarry Smith t[0] = b[*r++]; 53394e2b4712SSatish Balay for (i=1; i<n; i++) { 53404e2b4712SSatish Balay v = aa + ai[i]; 53414e2b4712SSatish Balay vi = aj + ai[i]; 53424e2b4712SSatish Balay nz = diag[i] - ai[i]; 5343f1af5d2fSBarry Smith s1 = b[*r++]; 53444e2b4712SSatish Balay while (nz--) { 5345f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53464e2b4712SSatish Balay } 5347f1af5d2fSBarry Smith t[i] = s1; 53484e2b4712SSatish Balay } 53494e2b4712SSatish Balay /* backward solve the upper triangular */ 53504e2b4712SSatish Balay for (i=n-1; i>=0; i--) { 53514e2b4712SSatish Balay v = aa + diag[i] + 1; 53524e2b4712SSatish Balay vi = aj + diag[i] + 1; 53534e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5354f1af5d2fSBarry Smith s1 = t[i]; 53554e2b4712SSatish Balay while (nz--) { 5356f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53574e2b4712SSatish Balay } 5358f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 53594e2b4712SSatish Balay } 53604e2b4712SSatish Balay 53614e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 53624e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 53633649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 53641ebc52fbSHong Zhang ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5365dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53664e2b4712SSatish Balay PetscFunctionReturn(0); 53674e2b4712SSatish Balay } 5368048b5e81SShri Abhyankar 5369048b5e81SShri Abhyankar #undef __FUNCT__ 5370048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5371048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5372048b5e81SShri Abhyankar { 5373048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5374048b5e81SShri Abhyankar IS iscol = a->col,isrow = a->row; 5375048b5e81SShri Abhyankar PetscErrorCode ierr; 5376048b5e81SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5377048b5e81SShri Abhyankar const PetscInt *rout,*cout,*r,*c; 5378048b5e81SShri Abhyankar PetscScalar *x,*tmp,sum; 5379048b5e81SShri Abhyankar const PetscScalar *b; 5380048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5381048b5e81SShri Abhyankar 5382048b5e81SShri Abhyankar PetscFunctionBegin; 5383048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5384048b5e81SShri Abhyankar 53853649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5386048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5387048b5e81SShri Abhyankar tmp = a->solve_work; 5388048b5e81SShri Abhyankar 5389048b5e81SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5390048b5e81SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5391048b5e81SShri Abhyankar 5392048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5393048b5e81SShri Abhyankar tmp[0] = b[r[0]]; 5394048b5e81SShri Abhyankar v = aa; 5395048b5e81SShri Abhyankar vi = aj; 5396048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5397048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5398048b5e81SShri Abhyankar sum = b[r[i]]; 5399048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5400048b5e81SShri Abhyankar tmp[i] = sum; 5401048b5e81SShri 
Abhyankar v += nz; vi += nz; 5402048b5e81SShri Abhyankar } 5403048b5e81SShri Abhyankar 5404048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5405048b5e81SShri Abhyankar for (i=n-1; i>=0; i--) { 5406048b5e81SShri Abhyankar v = aa + adiag[i+1]+1; 5407048b5e81SShri Abhyankar vi = aj + adiag[i+1]+1; 5408048b5e81SShri Abhyankar nz = adiag[i]-adiag[i+1]-1; 5409048b5e81SShri Abhyankar sum = tmp[i]; 5410048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5411048b5e81SShri Abhyankar x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5412048b5e81SShri Abhyankar } 5413048b5e81SShri Abhyankar 5414048b5e81SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5415048b5e81SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 54163649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5417048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5418048b5e81SShri Abhyankar ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5419048b5e81SShri Abhyankar PetscFunctionReturn(0); 5420048b5e81SShri Abhyankar } 5421048b5e81SShri Abhyankar 542215091d37SBarry Smith /* 542315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 542415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
542515091d37SBarry Smith */ 54264a2ae208SSatish Balay #undef __FUNCT__ 542706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 542806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 542915091d37SBarry Smith { 543015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5431b3260449SShri Abhyankar const PetscInt n = a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5432dfbe8321SBarry Smith PetscErrorCode ierr; 5433b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5434b3260449SShri Abhyankar PetscScalar *x; 5435b3260449SShri Abhyankar const PetscScalar *b; 543687828ca2SBarry Smith PetscScalar s1,x1; 5437b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 543815091d37SBarry Smith 543915091d37SBarry Smith PetscFunctionBegin; 54403649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 54411ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 544215091d37SBarry Smith 544315091d37SBarry Smith /* forward solve the lower triangular */ 544415091d37SBarry Smith idx = 0; 544515091d37SBarry Smith x[0] = b[0]; 544615091d37SBarry Smith for (i=1; i<n; i++) { 544715091d37SBarry Smith v = aa + ai[i]; 544815091d37SBarry Smith vi = aj + ai[i]; 544915091d37SBarry Smith nz = diag[i] - ai[i]; 545015091d37SBarry Smith idx += 1; 5451f1af5d2fSBarry Smith s1 = b[idx]; 545215091d37SBarry Smith while (nz--) { 545315091d37SBarry Smith jdx = *vi++; 545415091d37SBarry Smith x1 = x[jdx]; 5455f1af5d2fSBarry Smith s1 -= v[0]*x1; 545615091d37SBarry Smith v += 1; 545715091d37SBarry Smith } 5458f1af5d2fSBarry Smith x[idx] = s1; 545915091d37SBarry Smith } 546015091d37SBarry Smith /* backward solve the upper triangular */ 546115091d37SBarry Smith for (i=n-1; i>=0; i--) { 546215091d37SBarry Smith v = aa + diag[i] + 1; 546315091d37SBarry Smith vi = aj + diag[i] + 1; 546415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 546515091d37SBarry Smith idt = i; 5466f1af5d2fSBarry Smith s1 = x[idt]; 546715091d37SBarry 
Smith while (nz--) { 546815091d37SBarry Smith idx = *vi++; 546915091d37SBarry Smith x1 = x[idx]; 5470f1af5d2fSBarry Smith s1 -= v[0]*x1; 547115091d37SBarry Smith v += 1; 547215091d37SBarry Smith } 547315091d37SBarry Smith v = aa + diag[i]; 5474f1af5d2fSBarry Smith x[idt] = v[0]*s1; 547515091d37SBarry Smith } 54763649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 54771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5478dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 547915091d37SBarry Smith PetscFunctionReturn(0); 548015091d37SBarry Smith } 54814e2b4712SSatish Balay 5482048b5e81SShri Abhyankar 5483048b5e81SShri Abhyankar #undef __FUNCT__ 5484048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5485048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5486048b5e81SShri Abhyankar { 5487048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5488048b5e81SShri Abhyankar PetscErrorCode ierr; 5489048b5e81SShri Abhyankar const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5490048b5e81SShri Abhyankar PetscScalar *x,sum; 5491048b5e81SShri Abhyankar const PetscScalar *b; 5492048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5493048b5e81SShri Abhyankar PetscInt i,nz; 5494048b5e81SShri Abhyankar 5495048b5e81SShri Abhyankar PetscFunctionBegin; 5496048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5497048b5e81SShri Abhyankar 54983649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5499048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5500048b5e81SShri Abhyankar 5501048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5502048b5e81SShri Abhyankar x[0] = b[0]; 5503048b5e81SShri Abhyankar v = aa; 5504048b5e81SShri Abhyankar vi = aj; 5505048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5506048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5507048b5e81SShri Abhyankar sum = b[i]; 
5508048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5509048b5e81SShri Abhyankar v += nz; 5510048b5e81SShri Abhyankar vi += nz; 5511048b5e81SShri Abhyankar x[i] = sum; 5512048b5e81SShri Abhyankar } 5513048b5e81SShri Abhyankar 5514048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5515048b5e81SShri Abhyankar for (i=n-1; i>=0; i--) { 5516048b5e81SShri Abhyankar v = aa + adiag[i+1] + 1; 5517048b5e81SShri Abhyankar vi = aj + adiag[i+1] + 1; 5518048b5e81SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5519048b5e81SShri Abhyankar sum = x[i]; 5520048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5521048b5e81SShri Abhyankar x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5522048b5e81SShri Abhyankar } 5523048b5e81SShri Abhyankar 5524048b5e81SShri Abhyankar ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 55253649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5526048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5527048b5e81SShri Abhyankar PetscFunctionReturn(0); 5528048b5e81SShri Abhyankar } 5529048b5e81SShri Abhyankar 55304e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 553109573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool); 55326bce7ff8SHong Zhang 55332b0b2ea7SShri Abhyankar #undef __FUNCT__ 553429a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5535766f9fbaSBarry Smith /* 5536766f9fbaSBarry Smith This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5537766f9fbaSBarry Smith */ 553829a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 55392b0b2ea7SShri Abhyankar { 55402b0b2ea7SShri Abhyankar Mat C =B; 55412b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 55422b0b2ea7SShri Abhyankar 
PetscErrorCode ierr; 5543766f9fbaSBarry Smith PetscInt i,j,k,ipvt[15]; 5544766f9fbaSBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5545766f9fbaSBarry Smith PetscInt nz,nzL,row; 5546766f9fbaSBarry Smith MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5547766f9fbaSBarry Smith const MatScalar *v,*aa=a->a; 55482b0b2ea7SShri Abhyankar PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 55490fa040f9SShri Abhyankar PetscInt sol_ver; 55502b0b2ea7SShri Abhyankar 55512b0b2ea7SShri Abhyankar PetscFunctionBegin; 5552c55dd799SBarry Smith ierr = PetscOptionsGetInt(((PetscObject)A)->prefix,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 55530fa040f9SShri Abhyankar 55542b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 55552b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 55562b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 55572b0b2ea7SShri Abhyankar 55582b0b2ea7SShri Abhyankar for (i=0; i<n; i++) { 55592b0b2ea7SShri Abhyankar /* zero rtmp */ 55602b0b2ea7SShri Abhyankar /* L part */ 55612b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 55622b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 55632b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 55642b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55652b0b2ea7SShri Abhyankar } 55662b0b2ea7SShri Abhyankar 55672b0b2ea7SShri Abhyankar /* U part */ 55682b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 55692b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 55702b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 55712b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 55722b0b2ea7SShri Abhyankar } 55732b0b2ea7SShri Abhyankar 55742b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 557529a97285SShri Abhyankar nz = ai[i+1] - ai[i]; 557629a97285SShri Abhyankar ajtmp = aj + ai[i]; 
557729a97285SShri Abhyankar v = aa + bs2*ai[i]; 55782b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 557929a97285SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 55802b0b2ea7SShri Abhyankar } 55812b0b2ea7SShri Abhyankar 55822b0b2ea7SShri Abhyankar /* elimination */ 55832b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 55842b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 55852b0b2ea7SShri Abhyankar for (k=0; k < nzL; k++) { 55862b0b2ea7SShri Abhyankar row = bjtmp[k]; 55872b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 5588c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 5589c35f09e5SBarry Smith if (pc[j]!=0.0) { 5590c35f09e5SBarry Smith flg = 1; 5591c35f09e5SBarry Smith break; 5592c35f09e5SBarry Smith } 5593c35f09e5SBarry Smith } 55942b0b2ea7SShri Abhyankar if (flg) { 55952b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 559696b95a6bSBarry Smith PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); 559796b95a6bSBarry Smith /*ierr = PetscKernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 55982b0b2ea7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 55992b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 56002b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 56012b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5602766f9fbaSBarry Smith vv = rtmp + bs2*pj[j]; 560396b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 560496b95a6bSBarry Smith /* ierr = PetscKernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 56052b0b2ea7SShri Abhyankar pv += bs2; 56062b0b2ea7SShri Abhyankar } 5607766f9fbaSBarry Smith ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 56082b0b2ea7SShri Abhyankar } 56092b0b2ea7SShri Abhyankar } 56102b0b2ea7SShri Abhyankar 56112b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 56122b0b2ea7SShri Abhyankar /* L part */ 56132b0b2ea7SShri Abhyankar pv = 
b->a + bs2*bi[i]; 56142b0b2ea7SShri Abhyankar pj = b->j + bi[i]; 56152b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56162b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 56172b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56182b0b2ea7SShri Abhyankar } 56192b0b2ea7SShri Abhyankar 56202b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 56212b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 56222b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 56232b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 562496b95a6bSBarry Smith /* PetscKernel_A_gets_inverse_A(bs,pv,pivots,work); */ 562596b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 56262b0b2ea7SShri Abhyankar 56272b0b2ea7SShri Abhyankar /* U part */ 56282b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 56292b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 56302b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 56312b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 56322b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56332b0b2ea7SShri Abhyankar } 56342b0b2ea7SShri Abhyankar } 56352b0b2ea7SShri Abhyankar 56362b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5637*26fbe8dcSKarl Rupp 5638832cc040SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5639766f9fbaSBarry Smith C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 56402b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5641*26fbe8dcSKarl Rupp 5642766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 56432b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 56442b0b2ea7SShri Abhyankar } 56452b0b2ea7SShri Abhyankar 56466bce7ff8SHong Zhang #undef __FUNCT__ 56474dd39f65SShri Abhyankar #define __FUNCT__ 
"MatLUFactorNumeric_SeqBAIJ_N" 56484dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 56496bce7ff8SHong Zhang { 56506bce7ff8SHong Zhang Mat C =B; 56516bce7ff8SHong Zhang Mat_SeqBAIJ *a =(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ*)C->data; 56526bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 56536bce7ff8SHong Zhang PetscErrorCode ierr; 56545a586d82SBarry Smith const PetscInt *r,*ic; 56556bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 56566bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5657b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5658914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5659914a18a2SHong Zhang MatScalar *v_work; 5660ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 56616bce7ff8SHong Zhang 56626bce7ff8SHong Zhang PetscFunctionBegin; 56636bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 56646bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5665ae3d28f0SHong Zhang 5666fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5667fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 56686bce7ff8SHong Zhang 5669914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5670fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5671914a18a2SHong Zhang 56726bce7ff8SHong Zhang for (i=0; i<n; i++) { 56736bce7ff8SHong Zhang /* zero rtmp */ 56746bce7ff8SHong Zhang /* L part */ 56756bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 56766bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5677914a18a2SHong Zhang for (j=0; j<nz; j++) { 5678914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5679914a18a2SHong Zhang } 56806bce7ff8SHong Zhang 56816bce7ff8SHong Zhang /* U part */ 
56821a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 56831a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 56841a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56851a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56861a83e813SShri Abhyankar } 56871a83e813SShri Abhyankar 56881a83e813SShri Abhyankar /* load in initial (unfactored row) */ 56891a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 56901a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 56911a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 56921a83e813SShri Abhyankar for (j=0; j<nz; j++) { 56931a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 56941a83e813SShri Abhyankar } 56951a83e813SShri Abhyankar 56961a83e813SShri Abhyankar /* elimination */ 56971a83e813SShri Abhyankar bjtmp = bj + bi[i]; 56981a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 56991a83e813SShri Abhyankar for (k=0; k < nzL; k++) { 57001a83e813SShri Abhyankar row = bjtmp[k]; 57011a83e813SShri Abhyankar pc = rtmp + bs2*row; 5702c35f09e5SBarry Smith for (flg=0,j=0; j<bs2; j++) { 5703c35f09e5SBarry Smith if (pc[j]!=0.0) { 5704c35f09e5SBarry Smith flg = 1; 5705c35f09e5SBarry Smith break; 5706c35f09e5SBarry Smith } 5707c35f09e5SBarry Smith } 57081a83e813SShri Abhyankar if (flg) { 57091a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 571096b95a6bSBarry Smith PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 57111a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 57121a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 57131a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 57141a83e813SShri Abhyankar for (j=0; j<nz; j++) { 571596b95a6bSBarry Smith PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 57161a83e813SShri Abhyankar } 57171a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops 
= 2*bs^3*nz + 2*bs^3 - bs2) */ 57181a83e813SShri Abhyankar } 57191a83e813SShri Abhyankar } 57201a83e813SShri Abhyankar 57211a83e813SShri Abhyankar /* finished row so stick it into b->a */ 57221a83e813SShri Abhyankar /* L part */ 57231a83e813SShri Abhyankar pv = b->a + bs2*bi[i]; 57241a83e813SShri Abhyankar pj = b->j + bi[i]; 57251a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 57261a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57271a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57281a83e813SShri Abhyankar } 57291a83e813SShri Abhyankar 57301a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 57311a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 57321a83e813SShri Abhyankar pj = b->j + bdiag[i]; 5733e32f2f54SBarry Smith /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 57341a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 573596b95a6bSBarry Smith ierr = PetscKernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 57361a83e813SShri Abhyankar 57371a83e813SShri Abhyankar /* U part */ 57381a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 57391a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 57401a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 57411a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57421a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57431a83e813SShri Abhyankar } 57441a83e813SShri Abhyankar } 57451a83e813SShri Abhyankar 57461a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5747fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 57481a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 57491a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 57501a83e813SShri Abhyankar 5751ae3d28f0SHong Zhang ierr = 
ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5752ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5753*26fbe8dcSKarl Rupp 5754ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 5755ae3d28f0SHong Zhang if (both_identity) { 57564dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5757ae3d28f0SHong Zhang } else { 57584dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5759ae3d28f0SHong Zhang } 57604dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5761ae3d28f0SHong Zhang 57621a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 5763*26fbe8dcSKarl Rupp 5764766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 57651a83e813SShri Abhyankar PetscFunctionReturn(0); 57661a83e813SShri Abhyankar } 57671a83e813SShri Abhyankar 57686bce7ff8SHong Zhang /* 57696bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 57704dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 57714dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
57726bce7ff8SHong Zhang */ 5773c0c7eb62SShri Abhyankar 57746bce7ff8SHong Zhang #undef __FUNCT__ 57754dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 57764dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 57776bce7ff8SHong Zhang { 57786bce7ff8SHong Zhang 57796bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 57806bce7ff8SHong Zhang PetscErrorCode ierr; 578116a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 578235aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 578335aa4fcfSShri Abhyankar 578435aa4fcfSShri Abhyankar PetscFunctionBegin; 578535aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 578635aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 578735aa4fcfSShri Abhyankar 578835aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 578935aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 579035aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5791*26fbe8dcSKarl Rupp 579235aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 5793379be0ddSLisandro Dalcin b->free_a = PETSC_TRUE; 5794379be0ddSLisandro Dalcin b->free_ij = PETSC_TRUE; 57951e40a84eSLisandro Dalcin fact->preallocated = PETSC_TRUE; 57961e40a84eSLisandro Dalcin fact->assembled = PETSC_TRUE; 579735aa4fcfSShri Abhyankar if (!b->diag) { 579835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 579935aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 580035aa4fcfSShri Abhyankar } 580135aa4fcfSShri Abhyankar bdiag = b->diag; 580235aa4fcfSShri Abhyankar 580335aa4fcfSShri Abhyankar if (n > 0) { 580435aa4fcfSShri 
Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 580535aa4fcfSShri Abhyankar } 580635aa4fcfSShri Abhyankar 580735aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 580835aa4fcfSShri Abhyankar bi = b->i; 580935aa4fcfSShri Abhyankar bj = b->j; 581035aa4fcfSShri Abhyankar 581135aa4fcfSShri Abhyankar /* L part */ 581235aa4fcfSShri Abhyankar bi[0] = 0; 581335aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 581435aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 581535aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 581635aa4fcfSShri Abhyankar aj = a->j + ai[i]; 581735aa4fcfSShri Abhyankar for (j=0; j<nz; j++) { 581835aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 581935aa4fcfSShri Abhyankar } 582035aa4fcfSShri Abhyankar } 582135aa4fcfSShri Abhyankar 582235aa4fcfSShri Abhyankar /* U part */ 582335aa4fcfSShri Abhyankar bi_temp = bi[n]; 582435aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 582535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--) { 582635aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 582735aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 582835aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 582935aa4fcfSShri Abhyankar for (j=0; j<nz; j++) { 583035aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 583135aa4fcfSShri Abhyankar } 583235aa4fcfSShri Abhyankar /* diag[i] */ 583335aa4fcfSShri Abhyankar *bj = i; bj++; 583435aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 583535aa4fcfSShri Abhyankar } 583635aa4fcfSShri Abhyankar PetscFunctionReturn(0); 583735aa4fcfSShri Abhyankar } 583835aa4fcfSShri Abhyankar 583935aa4fcfSShri Abhyankar #undef __FUNCT__ 58404dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 58414dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 584216a2bf60SHong Zhang { 584316a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 584416a2bf60SHong Zhang IS isicol; 584516a2bf60SHong Zhang PetscErrorCode ierr; 584616a2bf60SHong Zhang const 
PetscInt *r,*ic; 58477fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 584816a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 584916a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 585016a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 5851ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 585216a2bf60SHong Zhang PetscReal f; 585316a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 585416a2bf60SHong Zhang PetscBT lnkbt; 585516a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 585616a2bf60SHong Zhang PetscFreeSpaceList free_space =PETSC_NULL,current_space=PETSC_NULL; 585716a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5858ace3abfcSBarry Smith PetscBool missing; 58597fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 586016a2bf60SHong Zhang 586116a2bf60SHong Zhang PetscFunctionBegin; 5862e32f2f54SBarry Smith if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 58636ba06ab7SHong Zhang if (bs>1) { /* check shifttype */ 58646ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 58656ba06ab7SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 58666ba06ab7SHong Zhang } 58676ba06ab7SHong Zhang 586816a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5869e32f2f54SBarry Smith if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 587016a2bf60SHong Zhang 587116a2bf60SHong Zhang f = info->fill; 587216a2bf60SHong Zhang levels = (PetscInt)info->levels; 587316a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 5874*26fbe8dcSKarl Rupp 587516a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 587616a2bf60SHong Zhang 
587716a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 587816a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5879*26fbe8dcSKarl Rupp 5880ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 588116a2bf60SHong Zhang 58827fa3a6a0SHong Zhang if (!levels && both_identity) { 588316a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 58844dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 58854dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 588635aa4fcfSShri Abhyankar 5887d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 588835aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 588935aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 589035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 5891*26fbe8dcSKarl Rupp 589235aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 589335aa4fcfSShri Abhyankar b->row = isrow; 589435aa4fcfSShri Abhyankar b->col = iscol; 589535aa4fcfSShri Abhyankar b->icol = isicol; 589635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 589735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 589835aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 5899*26fbe8dcSKarl Rupp 590035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 590135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 590235aa4fcfSShri Abhyankar } 590335aa4fcfSShri Abhyankar 590435aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 590535aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 590635aa4fcfSShri Abhyankar 590735aa4fcfSShri Abhyankar /* get new row pointers */ 590835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 590935aa4fcfSShri Abhyankar bi[0] = 0; 591035aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 591135aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 591235aa4fcfSShri Abhyankar bdiag[0] = 0; 591335aa4fcfSShri Abhyankar 5914fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 591535aa4fcfSShri Abhyankar 591635aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 591735aa4fcfSShri Abhyankar nlnk = n + 1; 591835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 591935aa4fcfSShri Abhyankar 592035aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 592135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 592235aa4fcfSShri Abhyankar current_space = free_space; 592335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 592435aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 592535aa4fcfSShri Abhyankar 592635aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 592735aa4fcfSShri Abhyankar nzi = 0; 592835aa4fcfSShri Abhyankar /* copy current row into linked list */ 592935aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 5930e32f2f54SBarry Smith if (!nnz) 
SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 593135aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 593235aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 593335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 593435aa4fcfSShri Abhyankar nzi += nlnk; 593535aa4fcfSShri Abhyankar 593635aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 593735aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 593835aa4fcfSShri Abhyankar fm = n; 593935aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 594035aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 594135aa4fcfSShri Abhyankar lnk[fm] = i; 594235aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 594335aa4fcfSShri Abhyankar nzi++; dcount++; 594435aa4fcfSShri Abhyankar } 594535aa4fcfSShri Abhyankar 594635aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 594735aa4fcfSShri Abhyankar nzbd = 0; 594835aa4fcfSShri Abhyankar prow = lnk[n]; 594935aa4fcfSShri Abhyankar while (prow < i) { 595035aa4fcfSShri Abhyankar nnz = bdiag[prow]; 595135aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 595235aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 595335aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 5954*26fbe8dcSKarl Rupp 595535aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 595635aa4fcfSShri Abhyankar nzi += nlnk; 595735aa4fcfSShri Abhyankar prow = lnk[prow]; 595835aa4fcfSShri Abhyankar nzbd++; 595935aa4fcfSShri Abhyankar } 596035aa4fcfSShri Abhyankar bdiag[i] = nzbd; 596135aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 596235aa4fcfSShri Abhyankar 596335aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 596435aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 596535aa4fcfSShri 
Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 596635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 596735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 596835aa4fcfSShri Abhyankar reallocs++; 596935aa4fcfSShri Abhyankar } 597035aa4fcfSShri Abhyankar 597135aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 597235aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5973*26fbe8dcSKarl Rupp 597435aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 597535aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 597635aa4fcfSShri Abhyankar 597735aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 597865e19b50SBarry Smith if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 597935aa4fcfSShri Abhyankar 598035aa4fcfSShri Abhyankar current_space->array += nzi; 598135aa4fcfSShri Abhyankar current_space->local_used += nzi; 598235aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 5983*26fbe8dcSKarl Rupp 598435aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 598535aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 598635aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 598735aa4fcfSShri Abhyankar } 598835aa4fcfSShri Abhyankar 598935aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 599035aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 599135aa4fcfSShri Abhyankar 599235aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 59939263d837SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 59942ce24eb6SHong 
Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 599535aa4fcfSShri Abhyankar 599635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 599735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5998fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 599935aa4fcfSShri Abhyankar 600035aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 600135aa4fcfSShri Abhyankar { 6002aef85c9fSShri Abhyankar PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 600335aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 600435aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 600535aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 600635aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 600735aa4fcfSShri Abhyankar if (diagonal_fill) { 600835aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 600935aa4fcfSShri Abhyankar } 601035aa4fcfSShri Abhyankar } 601135aa4fcfSShri Abhyankar #endif 601235aa4fcfSShri Abhyankar 601335aa4fcfSShri Abhyankar /* put together the new matrix */ 601435aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 601535aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6016*26fbe8dcSKarl Rupp 601735aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 601835aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 601935aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 602035aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 6021*26fbe8dcSKarl Rupp 602235aa4fcfSShri Abhyankar ierr = PetscMalloc((bs2*(bdiag[0]+1))*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6023*26fbe8dcSKarl Rupp 602435aa4fcfSShri Abhyankar b->j = bj; 
602535aa4fcfSShri Abhyankar b->i = bi; 602635aa4fcfSShri Abhyankar b->diag = bdiag; 602735aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 602835aa4fcfSShri Abhyankar b->ilen = 0; 602935aa4fcfSShri Abhyankar b->imax = 0; 603035aa4fcfSShri Abhyankar b->row = isrow; 603135aa4fcfSShri Abhyankar b->col = iscol; 603235aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 603335aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 603435aa4fcfSShri Abhyankar b->icol = isicol; 6035*26fbe8dcSKarl Rupp 603635aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 603735aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 603835aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 603935aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 604035aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 6041*26fbe8dcSKarl Rupp 6042ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 6043ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6044ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 6045*26fbe8dcSKarl Rupp 60464dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 604735aa4fcfSShri Abhyankar PetscFunctionReturn(0); 604835aa4fcfSShri Abhyankar } 604935aa4fcfSShri Abhyankar 60504e2b4712SSatish Balay /* 60514e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 60524e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 60534e2b4712SSatish Balay Not a good example of code reuse. 
60544e2b4712SSatish Balay */ 60554a2ae208SSatish Balay #undef __FUNCT__ 605606e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 605706e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 60584e2b4712SSatish Balay { 60594e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 60604e2b4712SSatish Balay IS isicol; 60616849ba73SBarry Smith PetscErrorCode ierr; 60625d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 60635d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6064a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6065d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6066ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity,flg; 6067329f5518SBarry Smith PetscReal f; 60684e2b4712SSatish Balay 60694e2b4712SSatish Balay PetscFunctionBegin; 60706bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6071e32f2f54SBarry Smith if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 60726bce7ff8SHong Zhang 6073435faa5fSBarry Smith f = info->fill; 6074690b6cddSBarry Smith levels = (PetscInt)info->levels; 6075690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 6076*26fbe8dcSKarl Rupp 60774c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 607816a2bf60SHong Zhang 6079667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6080667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6081ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 6082309c388cSBarry Smith 608341df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 608416a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 60858b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 60866bce7ff8SHong Zhang 6087d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 6088ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6089bb3d539aSBarry Smith b->row = isrow; 6090bb3d539aSBarry Smith b->col = iscol; 6091bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6092bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6093bb3d539aSBarry Smith b->icol = isicol; 6094bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6095*26fbe8dcSKarl Rupp 6096b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 60976bce7ff8SHong Zhang PetscFunctionReturn(0); 60986bce7ff8SHong Zhang } 60996bce7ff8SHong Zhang 61006bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 61014e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 61024e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 61034e2b4712SSatish Balay 61044e2b4712SSatish Balay /* get new row pointers */ 6105690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 61064e2b4712SSatish Balay ainew[0] = 0; 61074e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 6108690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 6109690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 61104e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 6111690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 61124e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 6113690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 61144e2b4712SSatish Balay /* im is level 
for each filled value */ 6115690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 61164e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 6117690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 61184e2b4712SSatish Balay dloc[0] = 0; 61194e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 6120435faa5fSBarry Smith 6121435faa5fSBarry Smith /* copy prow into linked list */ 61224e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6123e32f2f54SBarry Smith if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 61244e2b4712SSatish Balay xi = aj + ai[r[prow]]; 61254e2b4712SSatish Balay fill[n] = n; 6126435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 61274e2b4712SSatish Balay while (nz--) { 61284e2b4712SSatish Balay fm = n; 61294e2b4712SSatish Balay idx = ic[*xi++]; 61304e2b4712SSatish Balay do { 61314e2b4712SSatish Balay m = fm; 61324e2b4712SSatish Balay fm = fill[m]; 61334e2b4712SSatish Balay } while (fm < idx); 61344e2b4712SSatish Balay fill[m] = idx; 61354e2b4712SSatish Balay fill[idx] = fm; 61364e2b4712SSatish Balay im[idx] = 0; 61374e2b4712SSatish Balay } 6138435faa5fSBarry Smith 6139435faa5fSBarry Smith /* make sure diagonal entry is included */ 6140435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 6141435faa5fSBarry Smith fm = n; 6142435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 6143435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6144435faa5fSBarry Smith fill[fm] = prow; 6145435faa5fSBarry Smith im[prow] = 0; 6146435faa5fSBarry Smith nzf++; 6147335d9088SBarry Smith dcount++; 6148435faa5fSBarry Smith } 6149435faa5fSBarry Smith 61504e2b4712SSatish Balay nzi = 0; 61514e2b4712SSatish Balay row = fill[n]; 61524e2b4712SSatish Balay while (row < prow) { 61534e2b4712SSatish Balay incrlev = im[row] + 1; 
61544e2b4712SSatish Balay nz = dloc[row]; 6155435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 61564e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 61574e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 61584e2b4712SSatish Balay fm = row; 61594e2b4712SSatish Balay while (nnz-- > 0) { 61604e2b4712SSatish Balay idx = *xi++; 61614e2b4712SSatish Balay if (*flev + incrlev > levels) { 61624e2b4712SSatish Balay flev++; 61634e2b4712SSatish Balay continue; 61644e2b4712SSatish Balay } 61654e2b4712SSatish Balay do { 61664e2b4712SSatish Balay m = fm; 61674e2b4712SSatish Balay fm = fill[m]; 61684e2b4712SSatish Balay } while (fm < idx); 61694e2b4712SSatish Balay if (fm != idx) { 61704e2b4712SSatish Balay im[idx] = *flev + incrlev; 61714e2b4712SSatish Balay fill[m] = idx; 61724e2b4712SSatish Balay fill[idx] = fm; 61734e2b4712SSatish Balay fm = idx; 61744e2b4712SSatish Balay nzf++; 6175*26fbe8dcSKarl Rupp } else if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 61764e2b4712SSatish Balay flev++; 61774e2b4712SSatish Balay } 61784e2b4712SSatish Balay row = fill[row]; 61794e2b4712SSatish Balay nzi++; 61804e2b4712SSatish Balay } 61814e2b4712SSatish Balay /* copy new filled row into permanent storage */ 61824e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 61834e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 6184ecf371e4SBarry Smith 6185ecf371e4SBarry Smith /* estimate how much additional space we will need */ 6186ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6187ecf371e4SBarry Smith /* just double the memory each time */ 6188690b6cddSBarry Smith PetscInt maxadd = jmax; 6189ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 61904e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 61914e2b4712SSatish Balay jmax += maxadd; 6192ecf371e4SBarry Smith 6193ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 61945d0c19d7SBarry Smith ierr = 
PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 61955d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6196606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 61975d0c19d7SBarry Smith ajnew = xitmp; 61985d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 61995d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6200606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62015d0c19d7SBarry Smith ajfill = xitmp; 6202eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 62034e2b4712SSatish Balay } 62045d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 62054e2b4712SSatish Balay flev = ajfill + ainew[prow]; 62064e2b4712SSatish Balay dloc[prow] = nzi; 62074e2b4712SSatish Balay fm = fill[n]; 62084e2b4712SSatish Balay while (nzf--) { 62095d0c19d7SBarry Smith *xitmp++ = fm; 62104e2b4712SSatish Balay *flev++ = im[fm]; 62114e2b4712SSatish Balay fm = fill[fm]; 62124e2b4712SSatish Balay } 6213435faa5fSBarry Smith /* make sure row has diagonal entry */ 6214f23aa3ddSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 62152401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6216435faa5fSBarry Smith } 6217606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62184e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 62194e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6220606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 6221606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 62224e2b4712SSatish Balay 62236cf91177SBarry Smith #if defined(PETSC_USE_INFO) 62244e2b4712SSatish Balay { 6225329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6226ae15b995SBarry Smith ierr 
= PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6227ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6228ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6229ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6230335d9088SBarry Smith if (diagonal_fill) { 6231ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6232335d9088SBarry Smith } 62334e2b4712SSatish Balay } 623463ba0a88SBarry Smith #endif 62354e2b4712SSatish Balay 62364e2b4712SSatish Balay /* put together the new matrix */ 6237719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6238719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6239ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6240*26fbe8dcSKarl Rupp 6241e6b907acSBarry Smith b->free_a = PETSC_TRUE; 6242e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 62437c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 6244*26fbe8dcSKarl Rupp 6245a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6246*26fbe8dcSKarl Rupp 62474e2b4712SSatish Balay b->j = ajnew; 62484e2b4712SSatish Balay b->i = ainew; 62494e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 62504e2b4712SSatish Balay b->diag = dloc; 62517f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 62524e2b4712SSatish Balay b->ilen = 0; 62534e2b4712SSatish Balay b->imax = 0; 62544e2b4712SSatish Balay b->row = isrow; 62554e2b4712SSatish Balay b->col = iscol; 6256bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 6257*26fbe8dcSKarl Rupp 6258c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6259c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6260e51c0b9cSSatish Balay b->icol = isicol; 626187828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 62624e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 62634e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 6264719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 62654e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 62664e2b4712SSatish Balay 6267ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 6268ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6269ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 62706bce7ff8SHong Zhang 62718b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 62728661488fSKris Buschelman PetscFunctionReturn(0); 62738661488fSKris Buschelman } 62748661488fSKris Buschelman 6275732ee342SKris Buschelman #undef __FUNCT__ 62767e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6277dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 62787e7071cdSKris Buschelman { 627912272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 628012272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 62815fd66863SKarl Rupp 62825a9542e3SKris Buschelman PetscFunctionBegin; 62837cf1b8d3SKris Buschelman /* Undo Column scaling */ 62847cf1b8d3SKris Buschelman /* while (nz--) { */ 62857cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 62867cf1b8d3SKris Buschelman /* } */ 6287c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6288c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62897cf1b8d3SKris Buschelman PetscFunctionReturn(0); 62907cf1b8d3SKris Buschelman } 62917cf1b8d3SKris Buschelman 62927cf1b8d3SKris Buschelman #undef __FUNCT__ 62937cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6294dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 62957cf1b8d3SKris Buschelman { 62967cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 6297b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 62982aa5897fSKris Buschelman unsigned short *aj=(unsigned short*)AJ; 62995fd66863SKarl Rupp 63005a9542e3SKris Buschelman PetscFunctionBegin; 63010b9da03eSKris Buschelman /* Is this really necessary? */ 630220235379SKris Buschelman while (nz--) { 63030b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 63047e7071cdSKris Buschelman } 6305c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 63067e7071cdSKris Buschelman PetscFunctionReturn(0); 63077e7071cdSKris Buschelman } 63087e7071cdSKris Buschelman 6309732ee342SKris Buschelman 6310