1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 34e2b4712SSatish Balay /* 44e2b4712SSatish Balay Factorization code for BAIJ format. 54e2b4712SSatish Balay */ 64e2b4712SSatish Balay 77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 8c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 916a2bf60SHong Zhang #include "petscbt.h" 1016a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 114e2b4712SSatish Balay 124a2ae208SSatish Balay #undef __FUNCT__ 1393fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 1493fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1593fd935bSShri Abhyankar { 1693fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1793fd935bSShri Abhyankar PetscErrorCode ierr; 1893fd935bSShri Abhyankar const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 1993fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 2093fd935bSShri Abhyankar PetscInt nz; 2193fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 2293fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 2393fd935bSShri Abhyankar const PetscScalar *b; 2493fd935bSShri Abhyankar 2593fd935bSShri Abhyankar PetscFunctionBegin; 26*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2793fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2893fd935bSShri Abhyankar tmp = a->solve_work; 2993fd935bSShri Abhyankar 3093fd935bSShri Abhyankar 3193fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 3293fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[i]; 3393fd935bSShri Abhyankar 3493fd935bSShri Abhyankar /* forward solve the U^T */ 3593fd935bSShri Abhyankar for (i=0; i<n; i++) { 3693fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 3793fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 3893fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 3993fd935bSShri Abhyankar s1 = tmp[i]; 4093fd935bSShri Abhyankar s1 *= v[nz]; /* 
multiply by inverse of diagonal entry */ 4193fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 4293fd935bSShri Abhyankar tmp[i] = s1; 4393fd935bSShri Abhyankar } 4493fd935bSShri Abhyankar 4593fd935bSShri Abhyankar /* backward solve the L^T */ 4693fd935bSShri Abhyankar for (i=n-1; i>=0; i--){ 4793fd935bSShri Abhyankar v = aa + ai[i]; 4893fd935bSShri Abhyankar vi = aj + ai[i]; 4993fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 5093fd935bSShri Abhyankar s1 = tmp[i]; 5193fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 5293fd935bSShri Abhyankar } 5393fd935bSShri Abhyankar 5493fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 5593fd935bSShri Abhyankar for (i=0; i<n; i++) x[i] = tmp[i]; 5693fd935bSShri Abhyankar 57*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5893fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5993fd935bSShri Abhyankar 6093fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 6193fd935bSShri Abhyankar PetscFunctionReturn(0); 6293fd935bSShri Abhyankar } 6393fd935bSShri Abhyankar 6493fd935bSShri Abhyankar #undef __FUNCT__ 6506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 6606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 67f1af5d2fSBarry Smith { 68f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 69dfbe8321SBarry Smith PetscErrorCode ierr; 700b68f018SBarry Smith PetscInt i,nz; 710b68f018SBarry Smith const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 720b68f018SBarry Smith const MatScalar *aa=a->a,*v; 730b68f018SBarry Smith PetscScalar s1,*x; 740b68f018SBarry Smith const PetscScalar *b; 75f1af5d2fSBarry Smith 76f1af5d2fSBarry Smith PetscFunctionBegin; 77ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 78*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 791ebc52fbSHong Zhang ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 80f1af5d2fSBarry Smith 81f1af5d2fSBarry Smith /* forward solve the U^T */ 82f1af5d2fSBarry Smith for (i=0; i<n; i++) { 83f1af5d2fSBarry Smith 84f1af5d2fSBarry Smith v = aa + diag[i]; 85f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 86ef66eb69SBarry Smith s1 = (*v++)*x[i]; 87f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 88f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 89f1af5d2fSBarry Smith while (nz--) { 90f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 91f1af5d2fSBarry Smith } 92f1af5d2fSBarry Smith x[i] = s1; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith /* backward solve the L^T */ 95f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 96f1af5d2fSBarry Smith v = aa + diag[i] - 1; 97f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 98f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 99f1af5d2fSBarry Smith s1 = x[i]; 100f1af5d2fSBarry Smith while (nz--) { 101f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 102f1af5d2fSBarry Smith } 103f1af5d2fSBarry Smith } 104*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 106dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 107f1af5d2fSBarry Smith PetscFunctionReturn(0); 108f1af5d2fSBarry Smith } 109f1af5d2fSBarry Smith 1104a2ae208SSatish Balay #undef __FUNCT__ 11106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 11206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 113f1af5d2fSBarry Smith { 114f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 115dfbe8321SBarry Smith PetscErrorCode ierr; 116b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 117b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 118b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 119b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 120b3260449SShri Abhyankar 
const PetscScalar *b; 121f1af5d2fSBarry Smith 122f1af5d2fSBarry Smith PetscFunctionBegin; 123ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 124*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126f1af5d2fSBarry Smith 127f1af5d2fSBarry Smith /* forward solve the U^T */ 128f1af5d2fSBarry Smith idx = 0; 129f1af5d2fSBarry Smith for (i=0; i<n; i++) { 130f1af5d2fSBarry Smith 131f1af5d2fSBarry Smith v = aa + 4*diag[i]; 132f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 133ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 134f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 135f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 136f1af5d2fSBarry Smith v += 4; 137f1af5d2fSBarry Smith 138f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 139f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 140f1af5d2fSBarry Smith while (nz--) { 141f1af5d2fSBarry Smith oidx = 2*(*vi++); 142f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 143f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 144f1af5d2fSBarry Smith v += 4; 145f1af5d2fSBarry Smith } 146f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 147f1af5d2fSBarry Smith idx += 2; 148f1af5d2fSBarry Smith } 149f1af5d2fSBarry Smith /* backward solve the L^T */ 150f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 151f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 152f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 153f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 154f1af5d2fSBarry Smith idt = 2*i; 155f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 156f1af5d2fSBarry Smith while (nz--) { 157f1af5d2fSBarry Smith idx = 2*(*vi--); 158f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 159f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 160f1af5d2fSBarry Smith v -= 4; 161f1af5d2fSBarry Smith } 162f1af5d2fSBarry Smith } 163*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1641ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 
165dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 166f1af5d2fSBarry Smith PetscFunctionReturn(0); 167f1af5d2fSBarry Smith } 168f1af5d2fSBarry Smith 1694a2ae208SSatish Balay #undef __FUNCT__ 1704dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1714dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 1726929473cSShri Abhyankar { 1736929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1746929473cSShri Abhyankar PetscErrorCode ierr; 175b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1766929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 177b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 178b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 179b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 180b3260449SShri Abhyankar const PetscScalar *b; 1816929473cSShri Abhyankar 1826929473cSShri Abhyankar PetscFunctionBegin; 1836929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 184*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1856929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1866929473cSShri Abhyankar 1876929473cSShri Abhyankar /* forward solve the U^T */ 1886929473cSShri Abhyankar idx = 0; 1896929473cSShri Abhyankar for (i=0; i<n; i++) { 1906929473cSShri Abhyankar v = aa + bs2*diag[i]; 1916929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1926929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1936929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1946929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1956929473cSShri Abhyankar v -= bs2; 1966929473cSShri Abhyankar 1976929473cSShri Abhyankar vi = aj + diag[i] - 1; 1986929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1996929473cSShri Abhyankar for(j=0;j>-nz;j--){ 2006929473cSShri Abhyankar oidx = bs*vi[j]; 2016929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 
2026929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 2036929473cSShri Abhyankar v -= bs2; 2046929473cSShri Abhyankar } 2056929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 2066929473cSShri Abhyankar idx += bs; 2076929473cSShri Abhyankar } 2086929473cSShri Abhyankar /* backward solve the L^T */ 2096929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 2106929473cSShri Abhyankar v = aa + bs2*ai[i]; 2116929473cSShri Abhyankar vi = aj + ai[i]; 2126929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 2136929473cSShri Abhyankar idt = bs*i; 2146929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2156929473cSShri Abhyankar for(j=0;j<nz;j++){ 2166929473cSShri Abhyankar idx = bs*vi[j]; 2176929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 2186929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 2196929473cSShri Abhyankar v += bs2; 2206929473cSShri Abhyankar } 2216929473cSShri Abhyankar } 222*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2236929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2246929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2256929473cSShri Abhyankar PetscFunctionReturn(0); 2266929473cSShri Abhyankar } 2276929473cSShri Abhyankar 2286929473cSShri Abhyankar #undef __FUNCT__ 22906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 23006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 231f1af5d2fSBarry Smith { 232f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 233dfbe8321SBarry Smith PetscErrorCode ierr; 234b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 235b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 236b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 237b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 238b3260449SShri Abhyankar const PetscScalar *b; 239f1af5d2fSBarry Smith 240f1af5d2fSBarry Smith PetscFunctionBegin; 
241ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 242*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 244f1af5d2fSBarry Smith 245f1af5d2fSBarry Smith /* forward solve the U^T */ 246f1af5d2fSBarry Smith idx = 0; 247f1af5d2fSBarry Smith for (i=0; i<n; i++) { 248f1af5d2fSBarry Smith 249f1af5d2fSBarry Smith v = aa + 9*diag[i]; 250f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 251ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 252f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 253f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 254f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 255f1af5d2fSBarry Smith v += 9; 256f1af5d2fSBarry Smith 257f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 258f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 259f1af5d2fSBarry Smith while (nz--) { 260f1af5d2fSBarry Smith oidx = 3*(*vi++); 261f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 262f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 263f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 264f1af5d2fSBarry Smith v += 9; 265f1af5d2fSBarry Smith } 266f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 267f1af5d2fSBarry Smith idx += 3; 268f1af5d2fSBarry Smith } 269f1af5d2fSBarry Smith /* backward solve the L^T */ 270f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 271f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 272f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 273f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 274f1af5d2fSBarry Smith idt = 3*i; 275f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 276f1af5d2fSBarry Smith while (nz--) { 277f1af5d2fSBarry Smith idx = 3*(*vi--); 278f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 279f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 280f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 281f1af5d2fSBarry Smith v -= 9; 
282f1af5d2fSBarry Smith } 283f1af5d2fSBarry Smith } 284*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2851ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 286dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 287f1af5d2fSBarry Smith PetscFunctionReturn(0); 288f1af5d2fSBarry Smith } 289f1af5d2fSBarry Smith 2904a2ae208SSatish Balay #undef __FUNCT__ 2914dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2924dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2938499736aSShri Abhyankar { 2948499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2958499736aSShri Abhyankar PetscErrorCode ierr; 296b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2978499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 298b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 299b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 300b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 301b3260449SShri Abhyankar const PetscScalar *b; 3028499736aSShri Abhyankar 3038499736aSShri Abhyankar PetscFunctionBegin; 3048499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 305*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3068499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3078499736aSShri Abhyankar 3088499736aSShri Abhyankar /* forward solve the U^T */ 3098499736aSShri Abhyankar idx = 0; 3108499736aSShri Abhyankar for (i=0; i<n; i++) { 3118499736aSShri Abhyankar v = aa + bs2*diag[i]; 3128499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 3138499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3148499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 3158499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 3168499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 3178499736aSShri 
Abhyankar v -= bs2; 3188499736aSShri Abhyankar 3198499736aSShri Abhyankar vi = aj + diag[i] - 1; 3208499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3218499736aSShri Abhyankar for(j=0;j>-nz;j--){ 3228499736aSShri Abhyankar oidx = bs*vi[j]; 3238499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3248499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3258499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3268499736aSShri Abhyankar v -= bs2; 3278499736aSShri Abhyankar } 3288499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 3298499736aSShri Abhyankar idx += bs; 3308499736aSShri Abhyankar } 3318499736aSShri Abhyankar /* backward solve the L^T */ 3328499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 3338499736aSShri Abhyankar v = aa + bs2*ai[i]; 3348499736aSShri Abhyankar vi = aj + ai[i]; 3358499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 3368499736aSShri Abhyankar idt = bs*i; 3378499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 3388499736aSShri Abhyankar for(j=0;j<nz;j++){ 3398499736aSShri Abhyankar idx = bs*vi[j]; 3408499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3418499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3428499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3438499736aSShri Abhyankar v += bs2; 3448499736aSShri Abhyankar } 3458499736aSShri Abhyankar } 346*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3478499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3488499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3498499736aSShri Abhyankar PetscFunctionReturn(0); 3508499736aSShri Abhyankar } 3518499736aSShri Abhyankar 3528499736aSShri Abhyankar #undef __FUNCT__ 35306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 35406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 
355f1af5d2fSBarry Smith { 356f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 357dfbe8321SBarry Smith PetscErrorCode ierr; 358b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 359b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 360b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 361b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 362b3260449SShri Abhyankar const PetscScalar *b; 363f1af5d2fSBarry Smith 364f1af5d2fSBarry Smith PetscFunctionBegin; 365ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 366*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3671ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 368f1af5d2fSBarry Smith 369f1af5d2fSBarry Smith /* forward solve the U^T */ 370f1af5d2fSBarry Smith idx = 0; 371f1af5d2fSBarry Smith for (i=0; i<n; i++) { 372f1af5d2fSBarry Smith 373f1af5d2fSBarry Smith v = aa + 16*diag[i]; 374f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 375ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 376f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 377f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 378f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 379f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 380f1af5d2fSBarry Smith v += 16; 381f1af5d2fSBarry Smith 382f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 383f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 384f1af5d2fSBarry Smith while (nz--) { 385f1af5d2fSBarry Smith oidx = 4*(*vi++); 386f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 387f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 388f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 389f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 390f1af5d2fSBarry Smith v += 16; 391f1af5d2fSBarry Smith } 392f1af5d2fSBarry Smith 
x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 393f1af5d2fSBarry Smith idx += 4; 394f1af5d2fSBarry Smith } 395f1af5d2fSBarry Smith /* backward solve the L^T */ 396f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 397f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 398f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 399f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 400f1af5d2fSBarry Smith idt = 4*i; 401f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 402f1af5d2fSBarry Smith while (nz--) { 403f1af5d2fSBarry Smith idx = 4*(*vi--); 404f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 405f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 406f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 407f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 408f1af5d2fSBarry Smith v -= 16; 409f1af5d2fSBarry Smith } 410f1af5d2fSBarry Smith } 411*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 413dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 414f1af5d2fSBarry Smith PetscFunctionReturn(0); 415f1af5d2fSBarry Smith } 416f1af5d2fSBarry Smith 4174a2ae208SSatish Balay #undef __FUNCT__ 4184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 4194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4208499736aSShri Abhyankar { 4218499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4228499736aSShri Abhyankar PetscErrorCode ierr; 423b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 4248499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 425b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 426b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 427b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 
428b3260449SShri Abhyankar const PetscScalar *b; 4298499736aSShri Abhyankar 4308499736aSShri Abhyankar PetscFunctionBegin; 4318499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 432*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4338499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4348499736aSShri Abhyankar 4358499736aSShri Abhyankar /* forward solve the U^T */ 4368499736aSShri Abhyankar idx = 0; 4378499736aSShri Abhyankar for (i=0; i<n; i++) { 4388499736aSShri Abhyankar v = aa + bs2*diag[i]; 4398499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 4408499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 4418499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 4428499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 4438499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 4448499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 4458499736aSShri Abhyankar v -= bs2; 4468499736aSShri Abhyankar 4478499736aSShri Abhyankar vi = aj + diag[i] - 1; 4488499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 4498499736aSShri Abhyankar for(j=0;j>-nz;j--){ 4508499736aSShri Abhyankar oidx = bs*vi[j]; 4518499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4528499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4538499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4548499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4558499736aSShri Abhyankar v -= bs2; 4568499736aSShri Abhyankar } 4578499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4588499736aSShri Abhyankar idx += bs; 4598499736aSShri Abhyankar } 4608499736aSShri Abhyankar /* backward solve the L^T */ 4618499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 4628499736aSShri Abhyankar v = aa + bs2*ai[i]; 4638499736aSShri Abhyankar vi = aj + ai[i]; 
4648499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 4658499736aSShri Abhyankar idt = bs*i; 4668499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4678499736aSShri Abhyankar for(j=0;j<nz;j++){ 4688499736aSShri Abhyankar idx = bs*vi[j]; 4698499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4708499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4718499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4728499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4738499736aSShri Abhyankar v += bs2; 4748499736aSShri Abhyankar } 4758499736aSShri Abhyankar } 476*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4778499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4788499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4798499736aSShri Abhyankar PetscFunctionReturn(0); 4808499736aSShri Abhyankar } 4818499736aSShri Abhyankar 4828499736aSShri Abhyankar #undef __FUNCT__ 48306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 48406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 485f1af5d2fSBarry Smith { 486f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 487dfbe8321SBarry Smith PetscErrorCode ierr; 488b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 489b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 490b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 491b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 492b3260449SShri Abhyankar const PetscScalar *b; 493f1af5d2fSBarry Smith 494f1af5d2fSBarry Smith PetscFunctionBegin; 495ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 496*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4971ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
498f1af5d2fSBarry Smith 499f1af5d2fSBarry Smith /* forward solve the U^T */ 500f1af5d2fSBarry Smith idx = 0; 501f1af5d2fSBarry Smith for (i=0; i<n; i++) { 502f1af5d2fSBarry Smith 503f1af5d2fSBarry Smith v = aa + 25*diag[i]; 504f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 505ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 506f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 507f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 508f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 509f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 510f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 511f1af5d2fSBarry Smith v += 25; 512f1af5d2fSBarry Smith 513f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 514f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 515f1af5d2fSBarry Smith while (nz--) { 516f1af5d2fSBarry Smith oidx = 5*(*vi++); 517f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 518f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 519f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 520f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 521f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 522f1af5d2fSBarry Smith v += 25; 523f1af5d2fSBarry Smith } 524f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 525f1af5d2fSBarry Smith idx += 5; 526f1af5d2fSBarry Smith } 527f1af5d2fSBarry Smith /* backward solve the L^T */ 528f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 529f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 530f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 531f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 532f1af5d2fSBarry Smith idt = 5*i; 533f1af5d2fSBarry Smith 
s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 534f1af5d2fSBarry Smith while (nz--) { 535f1af5d2fSBarry Smith idx = 5*(*vi--); 536f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 537f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 538f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 539f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 540f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 541f1af5d2fSBarry Smith v -= 25; 542f1af5d2fSBarry Smith } 543f1af5d2fSBarry Smith } 544*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5451ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 546dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 547f1af5d2fSBarry Smith PetscFunctionReturn(0); 548f1af5d2fSBarry Smith } 549f1af5d2fSBarry Smith 5504a2ae208SSatish Balay #undef __FUNCT__ 5514dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5524dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5538499736aSShri Abhyankar { 5548499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5558499736aSShri Abhyankar PetscErrorCode ierr; 556b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5578499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 558b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 559b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 560b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 561b3260449SShri Abhyankar const PetscScalar *b; 5628499736aSShri Abhyankar 5638499736aSShri Abhyankar PetscFunctionBegin; 5648499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 565*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 
5668499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5678499736aSShri Abhyankar 5688499736aSShri Abhyankar /* forward solve the U^T */ 5698499736aSShri Abhyankar idx = 0; 5708499736aSShri Abhyankar for (i=0; i<n; i++) { 5718499736aSShri Abhyankar v = aa + bs2*diag[i]; 5728499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5738499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5748499736aSShri Abhyankar x5 = x[4+idx]; 5758499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5768499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5778499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5788499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5798499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5808499736aSShri Abhyankar v -= bs2; 5818499736aSShri Abhyankar 5828499736aSShri Abhyankar vi = aj + diag[i] - 1; 5838499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5848499736aSShri Abhyankar for(j=0;j>-nz;j--){ 5858499736aSShri Abhyankar oidx = bs*vi[j]; 5868499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5878499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5888499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5898499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5908499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5918499736aSShri Abhyankar v -= bs2; 5928499736aSShri Abhyankar } 5938499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5948499736aSShri Abhyankar idx += bs; 5958499736aSShri Abhyankar } 5968499736aSShri Abhyankar /* backward solve the L^T */ 5978499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 5988499736aSShri Abhyankar v = 
aa + bs2*ai[i]; 5998499736aSShri Abhyankar vi = aj + ai[i]; 6008499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 6018499736aSShri Abhyankar idt = bs*i; 6028499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 6038499736aSShri Abhyankar for(j=0;j<nz;j++){ 6048499736aSShri Abhyankar idx = bs*vi[j]; 6058499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 6068499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 6078499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 6088499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 6098499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 6108499736aSShri Abhyankar v += bs2; 6118499736aSShri Abhyankar } 6128499736aSShri Abhyankar } 613*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 6148499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 6158499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 6168499736aSShri Abhyankar PetscFunctionReturn(0); 6178499736aSShri Abhyankar } 6188499736aSShri Abhyankar 6198499736aSShri Abhyankar #undef __FUNCT__ 62006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 62106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 622f1af5d2fSBarry Smith { 623f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 624dfbe8321SBarry Smith PetscErrorCode ierr; 625b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 626b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 627b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 628b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 629b3260449SShri Abhyankar const PetscScalar *b; 630f1af5d2fSBarry Smith 631f1af5d2fSBarry Smith 
PetscFunctionBegin; 632ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 633*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 6341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 635f1af5d2fSBarry Smith 636f1af5d2fSBarry Smith /* forward solve the U^T */ 637f1af5d2fSBarry Smith idx = 0; 638f1af5d2fSBarry Smith for (i=0; i<n; i++) { 639f1af5d2fSBarry Smith 640f1af5d2fSBarry Smith v = aa + 36*diag[i]; 641f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 642ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 643ef66eb69SBarry Smith x6 = x[5+idx]; 644f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 645f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 646f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 647f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 648f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 649f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 650f1af5d2fSBarry Smith v += 36; 651f1af5d2fSBarry Smith 652f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 653f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 654f1af5d2fSBarry Smith while (nz--) { 655f1af5d2fSBarry Smith oidx = 6*(*vi++); 656f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 657f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 658f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 659f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 660f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 661f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + 
v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 662f1af5d2fSBarry Smith v += 36; 663f1af5d2fSBarry Smith } 664f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 665f1af5d2fSBarry Smith x[5+idx] = s6; 666f1af5d2fSBarry Smith idx += 6; 667f1af5d2fSBarry Smith } 668f1af5d2fSBarry Smith /* backward solve the L^T */ 669f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 670f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 671f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 672f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 673f1af5d2fSBarry Smith idt = 6*i; 674f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 675f1af5d2fSBarry Smith s6 = x[5+idt]; 676f1af5d2fSBarry Smith while (nz--) { 677f1af5d2fSBarry Smith idx = 6*(*vi--); 678f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684f1af5d2fSBarry Smith v -= 36; 685f1af5d2fSBarry Smith } 686f1af5d2fSBarry Smith } 687*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 6881ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 689dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 690f1af5d2fSBarry Smith PetscFunctionReturn(0); 691f1af5d2fSBarry Smith } 692f1af5d2fSBarry Smith 6934a2ae208SSatish Balay #undef __FUNCT__ 6944dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6954dd39f65SShri Abhyankar PetscErrorCode 
MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 6968499736aSShri Abhyankar { 6978499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6988499736aSShri Abhyankar PetscErrorCode ierr; 699b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 7008499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 701b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 702b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 703b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 704b3260449SShri Abhyankar const PetscScalar *b; 7058499736aSShri Abhyankar 7068499736aSShri Abhyankar PetscFunctionBegin; 7078499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 708*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 7098499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 7108499736aSShri Abhyankar 7118499736aSShri Abhyankar /* forward solve the U^T */ 7128499736aSShri Abhyankar idx = 0; 7138499736aSShri Abhyankar for (i=0; i<n; i++) { 7148499736aSShri Abhyankar v = aa + bs2*diag[i]; 7158499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 7168499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 7178499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 7188499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 7198499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 7208499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 7218499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 7228499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 7238499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 7248499736aSShri Abhyankar v -= bs2; 7258499736aSShri Abhyankar 7268499736aSShri Abhyankar vi = aj + diag[i] 
- 1; 7278499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 7288499736aSShri Abhyankar for(j=0;j>-nz;j--){ 7298499736aSShri Abhyankar oidx = bs*vi[j]; 7308499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7318499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7328499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7338499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7348499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7358499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7368499736aSShri Abhyankar v -= bs2; 7378499736aSShri Abhyankar } 7388499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 7398499736aSShri Abhyankar x[5+idx] = s6; 7408499736aSShri Abhyankar idx += bs; 7418499736aSShri Abhyankar } 7428499736aSShri Abhyankar /* backward solve the L^T */ 7438499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 7448499736aSShri Abhyankar v = aa + bs2*ai[i]; 7458499736aSShri Abhyankar vi = aj + ai[i]; 7468499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 7478499736aSShri Abhyankar idt = bs*i; 7488499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 7498499736aSShri Abhyankar s6 = x[5+idt]; 7508499736aSShri Abhyankar for(j=0;j<nz;j++){ 7518499736aSShri Abhyankar idx = bs*vi[j]; 7528499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7538499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7548499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7558499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7568499736aSShri Abhyankar 
x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7578499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7588499736aSShri Abhyankar v += bs2; 7598499736aSShri Abhyankar } 7608499736aSShri Abhyankar } 761*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 7628499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7638499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7648499736aSShri Abhyankar PetscFunctionReturn(0); 7658499736aSShri Abhyankar } 7668499736aSShri Abhyankar 7678499736aSShri Abhyankar #undef __FUNCT__ 76806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 76906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 770f1af5d2fSBarry Smith { 771f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 772dfbe8321SBarry Smith PetscErrorCode ierr; 773b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 774b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 775b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 776b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 777b3260449SShri Abhyankar const PetscScalar *b; 778f1af5d2fSBarry Smith 779f1af5d2fSBarry Smith PetscFunctionBegin; 780ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 781*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 7821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 783f1af5d2fSBarry Smith 784f1af5d2fSBarry Smith /* forward solve the U^T */ 785f1af5d2fSBarry Smith idx = 0; 786f1af5d2fSBarry Smith for (i=0; i<n; i++) { 787f1af5d2fSBarry Smith 788f1af5d2fSBarry Smith v = aa + 49*diag[i]; 789f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 790ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = 
x[4+idx]; 791ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 792f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 793f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 794f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 795f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 796f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 797f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 798f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 799f1af5d2fSBarry Smith v += 49; 800f1af5d2fSBarry Smith 801f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 802f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 803f1af5d2fSBarry Smith while (nz--) { 804f1af5d2fSBarry Smith oidx = 7*(*vi++); 805f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 806f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 807f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 808f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 809f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 810f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 811f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 812f1af5d2fSBarry Smith v += 49; 813f1af5d2fSBarry Smith } 814f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 815f1af5d2fSBarry Smith x[5+idx] = 
s6;x[6+idx] = s7; 816f1af5d2fSBarry Smith idx += 7; 817f1af5d2fSBarry Smith } 818f1af5d2fSBarry Smith /* backward solve the L^T */ 819f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 820f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 821f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 822f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 823f1af5d2fSBarry Smith idt = 7*i; 824f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 825f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 826f1af5d2fSBarry Smith while (nz--) { 827f1af5d2fSBarry Smith idx = 7*(*vi--); 828f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835f1af5d2fSBarry Smith v -= 49; 836f1af5d2fSBarry Smith } 837f1af5d2fSBarry Smith } 838*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 8391ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 840dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 841f1af5d2fSBarry Smith PetscFunctionReturn(0); 842f1af5d2fSBarry Smith } 8438499736aSShri Abhyankar #undef __FUNCT__ 8444dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 8454dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 
8468499736aSShri Abhyankar { 8478499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 8488499736aSShri Abhyankar PetscErrorCode ierr; 849b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 8508499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 851b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 852b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 853b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 854b3260449SShri Abhyankar const PetscScalar *b; 8558499736aSShri Abhyankar 8568499736aSShri Abhyankar PetscFunctionBegin; 8578499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 858*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 8598499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8608499736aSShri Abhyankar 8618499736aSShri Abhyankar /* forward solve the U^T */ 8628499736aSShri Abhyankar idx = 0; 8638499736aSShri Abhyankar for (i=0; i<n; i++) { 8648499736aSShri Abhyankar v = aa + bs2*diag[i]; 8658499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8668499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8678499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8688499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8698499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8708499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8718499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8728499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8738499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8748499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + 
v[47]*x6 + v[48]*x7; 8758499736aSShri Abhyankar v -= bs2; 8768499736aSShri Abhyankar vi = aj + diag[i] - 1; 8778499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8788499736aSShri Abhyankar for(j=0;j>-nz;j--){ 8798499736aSShri Abhyankar oidx = bs*vi[j]; 8808499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8818499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8828499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8838499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8848499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8858499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8868499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8878499736aSShri Abhyankar v -= bs2; 8888499736aSShri Abhyankar } 8898499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8908499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8918499736aSShri Abhyankar idx += bs; 8928499736aSShri Abhyankar } 8938499736aSShri Abhyankar /* backward solve the L^T */ 8948499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 8958499736aSShri Abhyankar v = aa + bs2*ai[i]; 8968499736aSShri Abhyankar vi = aj + ai[i]; 8978499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8988499736aSShri Abhyankar idt = bs*i; 8998499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 9008499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 9018499736aSShri Abhyankar for(j=0;j<nz;j++){ 9028499736aSShri Abhyankar idx = bs*vi[j]; 9038499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 
9048499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 9058499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 9068499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 9078499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 9088499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 9098499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 9108499736aSShri Abhyankar v += bs2; 9118499736aSShri Abhyankar } 9128499736aSShri Abhyankar } 913*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 9148499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 9158499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 9168499736aSShri Abhyankar PetscFunctionReturn(0); 9178499736aSShri Abhyankar } 918f1af5d2fSBarry Smith 919f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 9204a2ae208SSatish Balay #undef __FUNCT__ 92193fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 92293fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 92393fd935bSShri Abhyankar { 92493fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 92593fd935bSShri Abhyankar IS iscol = a->col,isrow = a->row; 92693fd935bSShri Abhyankar PetscErrorCode ierr; 92793fd935bSShri Abhyankar const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 92893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 92993fd935bSShri Abhyankar PetscInt nz; 93093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 93193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 93293fd935bSShri Abhyankar 
const PetscScalar *b; 93393fd935bSShri Abhyankar 93493fd935bSShri Abhyankar PetscFunctionBegin; 935*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 93693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 93793fd935bSShri Abhyankar tmp = a->solve_work; 93893fd935bSShri Abhyankar 93993fd935bSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 94093fd935bSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 94193fd935bSShri Abhyankar 94293fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 94393fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[c[i]]; 94493fd935bSShri Abhyankar 94593fd935bSShri Abhyankar /* forward solve the U^T */ 94693fd935bSShri Abhyankar for (i=0; i<n; i++) { 94793fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 94893fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 94993fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 95093fd935bSShri Abhyankar s1 = tmp[i]; 95193fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 95293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 95393fd935bSShri Abhyankar tmp[i] = s1; 95493fd935bSShri Abhyankar } 95593fd935bSShri Abhyankar 95693fd935bSShri Abhyankar /* backward solve the L^T */ 95793fd935bSShri Abhyankar for (i=n-1; i>=0; i--){ 95893fd935bSShri Abhyankar v = aa + ai[i]; 95993fd935bSShri Abhyankar vi = aj + ai[i]; 96093fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 96193fd935bSShri Abhyankar s1 = tmp[i]; 96293fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 96393fd935bSShri Abhyankar } 96493fd935bSShri Abhyankar 96593fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 96693fd935bSShri Abhyankar for (i=0; i<n; i++) x[r[i]] = tmp[i]; 96793fd935bSShri Abhyankar 96893fd935bSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 96993fd935bSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 970*3649974fSBarry 
Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 97193fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 97293fd935bSShri Abhyankar 97393fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 97493fd935bSShri Abhyankar PetscFunctionReturn(0); 97593fd935bSShri Abhyankar } 97693fd935bSShri Abhyankar 97793fd935bSShri Abhyankar #undef __FUNCT__ 97806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 97906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 980f1af5d2fSBarry Smith { 981f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 982f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9836849ba73SBarry Smith PetscErrorCode ierr; 9845d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 985b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 986b3260449SShri Abhyankar PetscInt i,nz; 987b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 988b3260449SShri Abhyankar PetscScalar s1,*x,*t; 989b3260449SShri Abhyankar const PetscScalar *b; 990f1af5d2fSBarry Smith 991f1af5d2fSBarry Smith PetscFunctionBegin; 992*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 9931ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 994f1af5d2fSBarry Smith t = a->solve_work; 995f1af5d2fSBarry Smith 996f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 997f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 998f1af5d2fSBarry Smith 999f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1000f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1001f1af5d2fSBarry Smith t[i] = b[c[i]]; 1002f1af5d2fSBarry Smith } 1003f1af5d2fSBarry Smith 1004f1af5d2fSBarry Smith /* forward solve the U^T */ 1005f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1006f1af5d2fSBarry Smith 1007f1af5d2fSBarry Smith v = aa + diag[i]; 1008f1af5d2fSBarry Smith /* multiply by the inverse 
of the block diagonal */ 1009f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 1010f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1011f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1012f1af5d2fSBarry Smith while (nz--) { 1013f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 1014f1af5d2fSBarry Smith } 1015f1af5d2fSBarry Smith t[i] = s1; 1016f1af5d2fSBarry Smith } 1017f1af5d2fSBarry Smith /* backward solve the L^T */ 1018f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1019f1af5d2fSBarry Smith v = aa + diag[i] - 1; 1020f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1021f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1022f1af5d2fSBarry Smith s1 = t[i]; 1023f1af5d2fSBarry Smith while (nz--) { 1024f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 1025f1af5d2fSBarry Smith } 1026f1af5d2fSBarry Smith } 1027f1af5d2fSBarry Smith 1028f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1029f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1030f1af5d2fSBarry Smith x[r[i]] = t[i]; 1031f1af5d2fSBarry Smith } 1032f1af5d2fSBarry Smith 1033f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1034f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1035*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 10361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1037dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1038f1af5d2fSBarry Smith PetscFunctionReturn(0); 1039f1af5d2fSBarry Smith } 1040f1af5d2fSBarry Smith 10414a2ae208SSatish Balay #undef __FUNCT__ 104206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 104306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1044f1af5d2fSBarry Smith { 1045f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1046f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10476849ba73SBarry Smith PetscErrorCode ierr; 10485d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1049b3260449SShri Abhyankar const 
PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1050b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1051b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1052b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1053b3260449SShri Abhyankar const PetscScalar *b; 1054f1af5d2fSBarry Smith 1055f1af5d2fSBarry Smith PetscFunctionBegin; 1056*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 10571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1058f1af5d2fSBarry Smith t = a->solve_work; 1059f1af5d2fSBarry Smith 1060f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1061f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1062f1af5d2fSBarry Smith 1063f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1064f1af5d2fSBarry Smith ii = 0; 1065f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1066f1af5d2fSBarry Smith ic = 2*c[i]; 1067f1af5d2fSBarry Smith t[ii] = b[ic]; 1068f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1069f1af5d2fSBarry Smith ii += 2; 1070f1af5d2fSBarry Smith } 1071f1af5d2fSBarry Smith 1072f1af5d2fSBarry Smith /* forward solve the U^T */ 1073f1af5d2fSBarry Smith idx = 0; 1074f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1075f1af5d2fSBarry Smith 1076f1af5d2fSBarry Smith v = aa + 4*diag[i]; 1077f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1078f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1079f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 1080f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 1081f1af5d2fSBarry Smith v += 4; 1082f1af5d2fSBarry Smith 1083f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1084f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1085f1af5d2fSBarry Smith while (nz--) { 1086f1af5d2fSBarry Smith oidx = 2*(*vi++); 1087f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 1088f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 1089f1af5d2fSBarry Smith v += 4; 1090f1af5d2fSBarry Smith } 1091f1af5d2fSBarry Smith 
t[idx] = s1;t[1+idx] = s2; 1092f1af5d2fSBarry Smith idx += 2; 1093f1af5d2fSBarry Smith } 1094f1af5d2fSBarry Smith /* backward solve the L^T */ 1095f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1096f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 1097f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1098f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1099f1af5d2fSBarry Smith idt = 2*i; 1100f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1101f1af5d2fSBarry Smith while (nz--) { 1102f1af5d2fSBarry Smith idx = 2*(*vi--); 1103f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 1104f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 1105f1af5d2fSBarry Smith v -= 4; 1106f1af5d2fSBarry Smith } 1107f1af5d2fSBarry Smith } 1108f1af5d2fSBarry Smith 1109f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1110f1af5d2fSBarry Smith ii = 0; 1111f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1112f1af5d2fSBarry Smith ir = 2*r[i]; 1113f1af5d2fSBarry Smith x[ir] = t[ii]; 1114f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1115f1af5d2fSBarry Smith ii += 2; 1116f1af5d2fSBarry Smith } 1117f1af5d2fSBarry Smith 1118f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1119f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1120*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 11211ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1122dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1123f1af5d2fSBarry Smith PetscFunctionReturn(0); 1124f1af5d2fSBarry Smith } 1125f1af5d2fSBarry Smith 11264a2ae208SSatish Balay #undef __FUNCT__ 11274dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 11284dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 112932121132SShri Abhyankar { 113032121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 113132121132SShri Abhyankar PetscErrorCode ierr; 113232121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 
1133b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 113432121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 113532121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1136b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1137b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1138b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1139b3260449SShri Abhyankar const PetscScalar *b; 114032121132SShri Abhyankar 114132121132SShri Abhyankar PetscFunctionBegin; 1142*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 114332121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 114432121132SShri Abhyankar t = a->solve_work; 114532121132SShri Abhyankar 114632121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 114732121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 114832121132SShri Abhyankar 114932121132SShri Abhyankar /* copy b into temp work space according to permutation */ 115032121132SShri Abhyankar for(i=0;i<n;i++){ 115132121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 115232121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 115332121132SShri Abhyankar } 115432121132SShri Abhyankar 115532121132SShri Abhyankar /* forward solve the U^T */ 115632121132SShri Abhyankar idx = 0; 115732121132SShri Abhyankar for (i=0; i<n; i++) { 115832121132SShri Abhyankar v = aa + bs2*diag[i]; 115932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 116032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 116132121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 116232121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 116332121132SShri Abhyankar v -= bs2; 116432121132SShri Abhyankar 116532121132SShri Abhyankar vi = aj + diag[i] - 1; 116632121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 116732121132SShri Abhyankar for(j=0;j>-nz;j--){ 116832121132SShri Abhyankar oidx = bs*vi[j]; 116932121132SShri Abhyankar t[oidx] -= v[0]*s1 
+ v[1]*s2; 117032121132SShri Abhyankar t[oidx+1] -= v[2]*s1 + v[3]*s2; 117132121132SShri Abhyankar v -= bs2; 117232121132SShri Abhyankar } 117332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 117432121132SShri Abhyankar idx += bs; 117532121132SShri Abhyankar } 117632121132SShri Abhyankar /* backward solve the L^T */ 117732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 117832121132SShri Abhyankar v = aa + bs2*ai[i]; 117932121132SShri Abhyankar vi = aj + ai[i]; 118032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 118132121132SShri Abhyankar idt = bs*i; 118232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 118332121132SShri Abhyankar for(j=0;j<nz;j++){ 118432121132SShri Abhyankar idx = bs*vi[j]; 118532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 118632121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 118732121132SShri Abhyankar v += bs2; 118832121132SShri Abhyankar } 118932121132SShri Abhyankar } 119032121132SShri Abhyankar 119132121132SShri Abhyankar /* copy t into x according to permutation */ 119232121132SShri Abhyankar for(i=0;i<n;i++){ 119332121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 119432121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 119532121132SShri Abhyankar } 119632121132SShri Abhyankar 119732121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 119832121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1199*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 120032121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 120132121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 120232121132SShri Abhyankar PetscFunctionReturn(0); 120332121132SShri Abhyankar } 120432121132SShri Abhyankar

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
/*
   MatSolveTranspose_SeqBAIJ_3_inplace - transpose solve with 3x3 blocks.

   Steps performed, all on the work array t = a->solve_work:
     1. gather bb into t, block i of t taken from block c[i] of bb
        (c = indices of a->col);
     2. "forward solve the U^T": for each block row i, apply the block
        stored at diag[i] to t_i, then update the rows named by the
        blocks diag[i]+1 .. ai[i+1]-1;
     3. "backward solve the L^T": for i = n-1 .. 0, update the rows named
        by the blocks diag[i]-1 down to ai[i];
     4. scatter t into xx, block i of t written to block r[i] of xx
        (r = indices of a->row).

   Input:  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
           bb - right-hand side (read only)
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.

   NOTE(review): the per-row products use three consecutive stored entries
   of each block (v[0..2], v[3..5], v[6..8]); presumably this realizes the
   transposed block product for the block storage order used by the
   factorization - confirm against the factor routine.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 3*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    ii += 3;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v = aa + 9*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
    s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
    s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
    s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
    v += 9;

    /* blocks after the diagonal one in this block row */
    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 3*(*vi++);
      t[oidx]   -= v[0]*s1 + v[1]*s2 + v[2]*s3;
      t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
      t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v += 9;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
    idx += 3;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    /* start at the block just before the diagonal one and walk toward ai[i] */
    v   = aa + 9*diag[i] - 9;
    vi  = aj + diag[i] - 1;
    nz  = diag[i] - ai[i];
    idt = 3*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
    while (nz--) {
      idx = 3*(*vi--);
      t[idx]   -= v[0]*s1 + v[1]*s2 + v[2]*s3;
      t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
      t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v -= 9;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 3*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    ii += 3;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
/*
   MatSolveTranspose_SeqBAIJ_3 - transpose solve with 3x3 blocks; differs
   from the _inplace variant only in how the factor blocks are indexed.

   Steps performed, all on the work array t = a->solve_work:
     1. gather bb into t through c (indices of a->col);
     2. "forward solve the U^T": for each block row i, apply the block at
        diag[i] to t_i, then walk DOWNWARD in storage over
        nz = diag[i]-diag[i+1]-1 blocks (v -= bs2, vi[j] with j = 0,-1,...)
        updating the rows they name;
     3. "backward solve the L^T": for i = n-1 .. 0, update the rows named
        by blocks ai[i] .. ai[i+1]-1;
     4. scatter t into xx through r (indices of a->row).

   Input:  A  - matrix whose data is a Mat_SeqBAIJ holding the factors
           bb - right-hand side (read only)
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.

   NOTE(review): bs and bs2 are read from A->rmap->bs and a->bs2, but the
   unrolled arithmetic below is written for bs == 3 / bs2 == 9; presumably
   the caller only dispatches here for that block size - confirm.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  IS                iscol=a->col,isrow=a->row;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
  const PetscInt    *r,*c,*rout,*cout;
  PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
  const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy b into temp work space according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ic = bs*c[i];
    t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {
    v = aa + bs2*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
    s1 = v[0]*x1 + v[1]*x2 + v[2]*x3;
    s2 = v[3]*x1 + v[4]*x2 + v[5]*x3;
    s3 = v[6]*x1 + v[7]*x2 + v[8]*x3;
    v -= bs2;

    /* remaining blocks of this row sit below diag[i] in storage; j runs 0,-1,... */
    vi = aj + diag[i] - 1;
    nz = diag[i] - diag[i+1] - 1;
    for(j=0;j>-nz;j--){
      oidx = bs*vi[j];
      t[oidx]   -= v[0]*s1 + v[1]*s2 + v[2]*s3;
      t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
      t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v -= bs2;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;
    idx += bs;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idt = bs*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
    for(j=0;j<nz;j++){
      idx = bs*vi[j];
      t[idx]   -= v[0]*s1 + v[1]*s2 + v[2]*s3;
      t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3;
      t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
      v += bs2;
    }
  }

  /* copy t into x according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ir = bs*r[i];
    x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
/*
   MatSolveTranspose_SeqBAIJ_4_inplace - transpose solve with 4x4 blocks.

   Same four steps as the 3x3 in-place routine, with 4 scalars per block
   row and 16 scalars per block: gather bb into a->solve_work through the
   indices of a->col, sweep forward over blocks diag[i] .. ai[i+1]-1
   ("U^T"), sweep backward over blocks diag[i]-1 down to ai[i] ("L^T"),
   scatter into xx through the indices of a->row.

   Input:  A - factored matrix (Mat_SeqBAIJ data), bb - right-hand side
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 4*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    ii += 4;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v = aa + 16*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
    s1 = v[0]*x1  + v[1]*x2  + v[2]*x3  + v[3]*x4;
    s2 = v[4]*x1  + v[5]*x2  + v[6]*x3  + v[7]*x4;
    s3 = v[8]*x1  + v[9]*x2  + v[10]*x3 + v[11]*x4;
    s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
    v += 16;

    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 4*(*vi++);
      t[oidx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4;
      t[oidx+1] -= v[4]*s1  + v[5]*s2  + v[6]*s3  + v[7]*s4;
      t[oidx+2] -= v[8]*s1  + v[9]*s2  + v[10]*s3 + v[11]*s4;
      t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v += 16;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
    idx += 4;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + 16*diag[i] - 16;
    vi  = aj + diag[i] - 1;
    nz  = diag[i] - ai[i];
    idt = 4*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
    while (nz--) {
      idx = 4*(*vi--);
      t[idx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4;
      t[idx+1] -= v[4]*s1  + v[5]*s2  + v[6]*s3  + v[7]*s4;
      t[idx+2] -= v[8]*s1  + v[9]*s2  + v[10]*s3 + v[11]*s4;
      t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v -= 16;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 4*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    ii += 4;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
/*
   MatSolveTranspose_SeqBAIJ_4 - transpose solve with 4x4 blocks; differs
   from the _inplace variant only in how the factor blocks are indexed.

   Gathers bb into a->solve_work through the indices of a->col; in the
   "U^T" sweep applies the block at diag[i] and then walks downward in
   storage over diag[i]-diag[i+1]-1 blocks; in the "L^T" sweep (i from
   n-1 to 0) visits blocks ai[i] .. ai[i+1]-1; finally scatters into xx
   through the indices of a->row.

   Input:  A - factored matrix (Mat_SeqBAIJ data), bb - right-hand side
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.

   NOTE(review): the unrolled arithmetic is written for bs == 4 /
   bs2 == 16 although both are read at run time - confirm the dispatch
   guarantees this.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  IS                iscol=a->col,isrow=a->row;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
  const PetscInt    *r,*c,*rout,*cout;
  PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
  const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy b into temp work space according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ic = bs*c[i];
    t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {
    v = aa + bs2*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx];
    s1 = v[0]*x1  + v[1]*x2  + v[2]*x3  + v[3]*x4;
    s2 = v[4]*x1  + v[5]*x2  + v[6]*x3  + v[7]*x4;
    s3 = v[8]*x1  + v[9]*x2  + v[10]*x3 + v[11]*x4;
    s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
    v -= bs2;

    vi = aj + diag[i] - 1;
    nz = diag[i] - diag[i+1] - 1;
    for(j=0;j>-nz;j--){
      oidx = bs*vi[j];
      t[oidx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4;
      t[oidx+1] -= v[4]*s1  + v[5]*s2  + v[6]*s3  + v[7]*s4;
      t[oidx+2] -= v[8]*s1  + v[9]*s2  + v[10]*s3 + v[11]*s4;
      t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v -= bs2;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4;
    idx += bs;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idt = bs*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt];
    for(j=0;j<nz;j++){
      idx = bs*vi[j];
      t[idx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4;
      t[idx+1] -= v[4]*s1  + v[5]*s2  + v[6]*s3  + v[7]*s4;
      t[idx+2] -= v[8]*s1  + v[9]*s2  + v[10]*s3 + v[11]*s4;
      t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
      v += bs2;
    }
  }

  /* copy t into x according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ir = bs*r[i];
    x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
/*
   MatSolveTranspose_SeqBAIJ_5_inplace - transpose solve with 5x5 blocks.

   Same four steps as the 3x3 in-place routine, with 5 scalars per block
   row and 25 scalars per block: gather bb into a->solve_work through the
   indices of a->col, sweep forward over blocks diag[i] .. ai[i+1]-1
   ("U^T"), sweep backward over blocks diag[i]-1 down to ai[i] ("L^T"),
   scatter into xx through the indices of a->row.

   Input:  A - factored matrix (Mat_SeqBAIJ data), bb - right-hand side
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 5*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
    ii += 5;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v = aa + 25*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    s1 = v[0]*x1  + v[1]*x2  + v[2]*x3  + v[3]*x4  + v[4]*x5;
    s2 = v[5]*x1  + v[6]*x2  + v[7]*x3  + v[8]*x4  + v[9]*x5;
    s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
    s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
    s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
    v += 25;

    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 5*(*vi++);
      t[oidx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5;
      t[oidx+1] -= v[5]*s1  + v[6]*s2  + v[7]*s3  + v[8]*s4  + v[9]*s5;
      t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v += 25;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    idx += 5;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + 25*diag[i] - 25;
    vi  = aj + diag[i] - 1;
    nz  = diag[i] - ai[i];
    idt = 5*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    while (nz--) {
      idx = 5*(*vi--);
      t[idx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5;
      t[idx+1] -= v[5]*s1  + v[6]*s2  + v[7]*s3  + v[8]*s4  + v[9]*s5;
      t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v -= 25;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 5*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    x[ir+4] = t[ii+4];
    ii += 5;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
/*
   MatSolveTranspose_SeqBAIJ_5 - transpose solve with 5x5 blocks; differs
   from the _inplace variant only in how the factor blocks are indexed.

   Gathers bb into a->solve_work through the indices of a->col; in the
   "U^T" sweep applies the block at diag[i] and then walks downward in
   storage over diag[i]-diag[i+1]-1 blocks; in the "L^T" sweep (i from
   n-1 to 0) visits blocks ai[i] .. ai[i+1]-1; finally scatters into xx
   through the indices of a->row.

   Input:  A - factored matrix (Mat_SeqBAIJ data), bb - right-hand side
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.

   NOTE(review): the unrolled arithmetic is written for bs == 5 /
   bs2 == 25 although both are read at run time - confirm the dispatch
   guarantees this.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  IS                iscol=a->col,isrow=a->row;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
  const PetscInt    *r,*c,*rout,*cout;
  PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
  const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy b into temp work space according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ic = bs*c[i];
    t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {
    v = aa + bs2*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    s1 = v[0]*x1  + v[1]*x2  + v[2]*x3  + v[3]*x4  + v[4]*x5;
    s2 = v[5]*x1  + v[6]*x2  + v[7]*x3  + v[8]*x4  + v[9]*x5;
    s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
    s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
    s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
    v -= bs2;

    vi = aj + diag[i] - 1;
    nz = diag[i] - diag[i+1] - 1;
    for(j=0;j>-nz;j--){
      oidx = bs*vi[j];
      t[oidx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5;
      t[oidx+1] -= v[5]*s1  + v[6]*s2  + v[7]*s3  + v[8]*s4  + v[9]*s5;
      t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v -= bs2;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5;
    idx += bs;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idt = bs*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt];
    for(j=0;j<nz;j++){
      idx = bs*vi[j];
      t[idx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5;
      t[idx+1] -= v[5]*s1  + v[6]*s2  + v[7]*s3  + v[8]*s4  + v[9]*s5;
      t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
      t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
      t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
      v += bs2;
    }
  }

  /* copy t into x according to permutation */
  for(i=0;i<n;i++){
    ii = bs*i; ir = bs*r[i];
    x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3];
    x[ir+4] = t[ii+4];
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

#undef __FUNCT__
#define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
/*
   MatSolveTranspose_SeqBAIJ_6_inplace - transpose solve with 6x6 blocks.

   Same four steps as the 3x3 in-place routine, with 6 scalars per block
   row and 36 scalars per block: gather bb into a->solve_work through the
   indices of a->col, sweep forward over blocks diag[i] .. ai[i+1]-1
   ("U^T"), sweep backward over blocks diag[i]-1 down to ai[i] ("L^T"),
   scatter into xx through the indices of a->row.

   Input:  A - factored matrix (Mat_SeqBAIJ data), bb - right-hand side
   Output: xx - solution
   Returns 0 on success; any PETSc error is propagated through CHKERRQ.
*/
PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* copy the b into temp work space according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ic      = 6*c[i];
    t[ii]   = b[ic];
    t[ii+1] = b[ic+1];
    t[ii+2] = b[ic+2];
    t[ii+3] = b[ic+3];
    t[ii+4] = b[ic+4];
    t[ii+5] = b[ic+5];
    ii += 6;
  }

  /* forward solve the U^T */
  idx = 0;
  for (i=0; i<n; i++) {

    v = aa + 36*diag[i];
    /* multiply by the inverse of the block diagonal */
    x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
    x6 = t[5+idx];
    s1 = v[0]*x1  + v[1]*x2  + v[2]*x3  + v[3]*x4  + v[4]*x5  + v[5]*x6;
    s2 = v[6]*x1  + v[7]*x2  + v[8]*x3  + v[9]*x4  + v[10]*x5 + v[11]*x6;
    s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
    s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
    s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
    s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
    v += 36;

    vi = aj + diag[i] + 1;
    nz = ai[i+1] - diag[i] - 1;
    while (nz--) {
      oidx = 6*(*vi++);
      t[oidx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5  + v[5]*s6;
      t[oidx+1] -= v[6]*s1  + v[7]*s2  + v[8]*s3  + v[9]*s4  + v[10]*s5 + v[11]*s6;
      t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v += 36;
    }
    t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
    t[5+idx] = s6;
    idx += 6;
  }
  /* backward solve the L^T */
  for (i=n-1; i>=0; i--){
    v   = aa + 36*diag[i] - 36;
    vi  = aj + diag[i] - 1;
    nz  = diag[i] - ai[i];
    idt = 6*i;
    s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
    s6 = t[5+idt];
    while (nz--) {
      idx = 6*(*vi--);
      t[idx]   -= v[0]*s1  + v[1]*s2  + v[2]*s3  + v[3]*s4  + v[4]*s5  + v[5]*s6;
      t[idx+1] -= v[6]*s1  + v[7]*s2  + v[8]*s3  + v[9]*s4  + v[10]*s5 + v[11]*s6;
      t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
      t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
      t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
      t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
      v -= 36;
    }
  }

  /* copy t into x according to permutation */
  ii = 0;
  for (i=0; i<n; i++) {
    ir      = 6*r[i];
    x[ir]   = t[ii];
    x[ir+1] = t[ii+1];
    x[ir+2] = t[ii+2];
    x[ir+3] = t[ii+3];
    x[ir+4] = t[ii+4];
    x[ir+5] = t[ii+5];
    ii += 6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

18554a2ae208SSatish Balay #undef __FUNCT__ 18564dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 18574dd39f65SShri
Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 185832121132SShri Abhyankar { 185932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 186032121132SShri Abhyankar PetscErrorCode ierr; 186132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1862b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 186332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 186432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1865b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1866b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1867b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1868b3260449SShri Abhyankar const PetscScalar *b; 186932121132SShri Abhyankar 187032121132SShri Abhyankar PetscFunctionBegin; 1871*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 187232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 187332121132SShri Abhyankar t = a->solve_work; 187432121132SShri Abhyankar 187532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 187632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 187732121132SShri Abhyankar 187832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 187932121132SShri Abhyankar for(i=0;i<n;i++){ 188032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 188132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 188232121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 188332121132SShri Abhyankar } 188432121132SShri Abhyankar 188532121132SShri Abhyankar /* forward solve the U^T */ 188632121132SShri Abhyankar idx = 0; 188732121132SShri Abhyankar for (i=0; i<n; i++) { 188832121132SShri Abhyankar v = aa + bs2*diag[i]; 188932121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 189032121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = 
t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 189132121132SShri Abhyankar x6 = t[5+idx]; 189232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 189332121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 189432121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 189532121132SShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 189632121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 189732121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 189832121132SShri Abhyankar v -= bs2; 189932121132SShri Abhyankar 190032121132SShri Abhyankar vi = aj + diag[i] - 1; 190132121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 190232121132SShri Abhyankar for(j=0;j>-nz;j--){ 190332121132SShri Abhyankar oidx = bs*vi[j]; 190432121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 190532121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 190632121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 190732121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 190832121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 190932121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 191032121132SShri Abhyankar v -= bs2; 191132121132SShri Abhyankar } 191232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 191332121132SShri Abhyankar t[5+idx] = s6; 191432121132SShri Abhyankar idx += bs; 191532121132SShri Abhyankar } 191632121132SShri Abhyankar /* backward solve the L^T */ 191732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 191832121132SShri Abhyankar v = aa + bs2*ai[i]; 
191932121132SShri Abhyankar vi = aj + ai[i]; 192032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 192132121132SShri Abhyankar idt = bs*i; 192232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 192332121132SShri Abhyankar s6 = t[5+idt]; 192432121132SShri Abhyankar for(j=0;j<nz;j++){ 192532121132SShri Abhyankar idx = bs*vi[j]; 192632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 192732121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 192832121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 192932121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 193032121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 193132121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 193232121132SShri Abhyankar v += bs2; 193332121132SShri Abhyankar } 193432121132SShri Abhyankar } 193532121132SShri Abhyankar 193632121132SShri Abhyankar /* copy t into x according to permutation */ 193732121132SShri Abhyankar for(i=0;i<n;i++){ 193832121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 193932121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 194032121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 194132121132SShri Abhyankar } 194232121132SShri Abhyankar 194332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 194432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1945*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 194632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 194732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 194832121132SShri Abhyankar PetscFunctionReturn(0); 194932121132SShri Abhyankar } 
195032121132SShri Abhyankar 195132121132SShri Abhyankar #undef __FUNCT__ 195206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 195306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1954f1af5d2fSBarry Smith { 1955f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1956f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 19576849ba73SBarry Smith PetscErrorCode ierr; 19585d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1959b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1960b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1961b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1962b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1963b3260449SShri Abhyankar const PetscScalar *b; 1964f1af5d2fSBarry Smith 1965f1af5d2fSBarry Smith PetscFunctionBegin; 1966*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 19671ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1968f1af5d2fSBarry Smith t = a->solve_work; 1969f1af5d2fSBarry Smith 1970f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1971f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1972f1af5d2fSBarry Smith 1973f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1974f1af5d2fSBarry Smith ii = 0; 1975f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1976f1af5d2fSBarry Smith ic = 7*c[i]; 1977f1af5d2fSBarry Smith t[ii] = b[ic]; 1978f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1979f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1980f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1981f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1982f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1983f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1984f1af5d2fSBarry Smith ii += 7; 1985f1af5d2fSBarry Smith } 1986f1af5d2fSBarry Smith 1987f1af5d2fSBarry Smith /* forward solve the U^T */ 1988f1af5d2fSBarry Smith idx = 0; 
1989f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1990f1af5d2fSBarry Smith 1991f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1992f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1993f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1994f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1995f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1996f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1997f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1998f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1999f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 2000f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2001f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2002f1af5d2fSBarry Smith v += 49; 2003f1af5d2fSBarry Smith 2004f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 2005f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 2006f1af5d2fSBarry Smith while (nz--) { 2007f1af5d2fSBarry Smith oidx = 7*(*vi++); 2008f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2009f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2010f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2011f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2012f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2013f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 
2014f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2015f1af5d2fSBarry Smith v += 49; 2016f1af5d2fSBarry Smith } 2017f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2018f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 2019f1af5d2fSBarry Smith idx += 7; 2020f1af5d2fSBarry Smith } 2021f1af5d2fSBarry Smith /* backward solve the L^T */ 2022f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 2023f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 2024f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 2025f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 2026f1af5d2fSBarry Smith idt = 7*i; 2027f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2028f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 2029f1af5d2fSBarry Smith while (nz--) { 2030f1af5d2fSBarry Smith idx = 7*(*vi--); 2031f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2032f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2033f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2034f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2035f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2036f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2037f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2038f1af5d2fSBarry Smith v -= 49; 2039f1af5d2fSBarry Smith } 2040f1af5d2fSBarry Smith } 2041f1af5d2fSBarry Smith 2042f1af5d2fSBarry Smith /* copy t into x according to permutation */ 2043f1af5d2fSBarry Smith ii = 0; 2044f1af5d2fSBarry Smith for (i=0; i<n; i++) { 2045f1af5d2fSBarry Smith ir = 7*r[i]; 
2046f1af5d2fSBarry Smith x[ir] = t[ii]; 2047f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 2048f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 2049f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 2050f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 2051f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 2052f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 2053f1af5d2fSBarry Smith ii += 7; 2054f1af5d2fSBarry Smith } 2055f1af5d2fSBarry Smith 2056f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2057f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2058*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 20591ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2060dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2061f1af5d2fSBarry Smith PetscFunctionReturn(0); 2062f1af5d2fSBarry Smith } 206332121132SShri Abhyankar #undef __FUNCT__ 20644dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 20654dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 206632121132SShri Abhyankar { 206732121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 206832121132SShri Abhyankar PetscErrorCode ierr; 206932121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 2070b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 207132121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 207232121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2073b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2074b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2075b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2076b3260449SShri Abhyankar const PetscScalar *b; 207732121132SShri Abhyankar 207832121132SShri Abhyankar PetscFunctionBegin; 2079*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 208032121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 208132121132SShri 
Abhyankar t = a->solve_work; 208232121132SShri Abhyankar 208332121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 208432121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 208532121132SShri Abhyankar 208632121132SShri Abhyankar /* copy b into temp work space according to permutation */ 208732121132SShri Abhyankar for(i=0;i<n;i++){ 208832121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 208932121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 209032121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 209132121132SShri Abhyankar } 209232121132SShri Abhyankar 209332121132SShri Abhyankar /* forward solve the U^T */ 209432121132SShri Abhyankar idx = 0; 209532121132SShri Abhyankar for (i=0; i<n; i++) { 209632121132SShri Abhyankar v = aa + bs2*diag[i]; 209732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 209832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 209932121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 210032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 210132121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 210232121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 210332121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 210432121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 210532121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 210632121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 210732121132SShri Abhyankar v -= bs2; 210832121132SShri Abhyankar 210932121132SShri Abhyankar vi = aj + diag[i] - 1; 
211032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 211132121132SShri Abhyankar for(j=0;j>-nz;j--){ 211232121132SShri Abhyankar oidx = bs*vi[j]; 211332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 211432121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 211532121132SShri Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 211632121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 211732121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 211832121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 211932121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 212032121132SShri Abhyankar v -= bs2; 212132121132SShri Abhyankar } 212232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 212332121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 212432121132SShri Abhyankar idx += bs; 212532121132SShri Abhyankar } 212632121132SShri Abhyankar /* backward solve the L^T */ 212732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 212832121132SShri Abhyankar v = aa + bs2*ai[i]; 212932121132SShri Abhyankar vi = aj + ai[i]; 213032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 213132121132SShri Abhyankar idt = bs*i; 213232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 213332121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 213432121132SShri Abhyankar for(j=0;j<nz;j++){ 213532121132SShri Abhyankar idx = bs*vi[j]; 213632121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 213732121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + 
v[12]*s6 + v[13]*s7; 213832121132SShri Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 213932121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 214032121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 214132121132SShri Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 214232121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 214332121132SShri Abhyankar v += bs2; 214432121132SShri Abhyankar } 214532121132SShri Abhyankar } 214632121132SShri Abhyankar 214732121132SShri Abhyankar /* copy t into x according to permutation */ 214832121132SShri Abhyankar for(i=0;i<n;i++){ 214932121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 215032121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 215132121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 215232121132SShri Abhyankar } 215332121132SShri Abhyankar 215432121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 215532121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2156*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 215732121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 215832121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 215932121132SShri Abhyankar PetscFunctionReturn(0); 216032121132SShri Abhyankar } 2161f1af5d2fSBarry Smith 21624e2b4712SSatish Balay /* ----------------------------------------------------------- */ 21634a2ae208SSatish Balay #undef __FUNCT__ 216406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 216506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21664e2b4712SSatish Balay { 21674e2b4712SSatish 
Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 21684e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21696849ba73SBarry Smith PetscErrorCode ierr; 2170b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2171b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2172b3260449SShri Abhyankar PetscInt i,nz; 2173b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2174b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2175b3260449SShri Abhyankar PetscScalar *x,*s,*t,*ls; 2176b3260449SShri Abhyankar const PetscScalar *b; 21774e2b4712SSatish Balay 21784e2b4712SSatish Balay PetscFunctionBegin; 2179*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2181f1af5d2fSBarry Smith t = a->solve_work; 21824e2b4712SSatish Balay 21834e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21844e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21854e2b4712SSatish Balay 21864e2b4712SSatish Balay /* forward solve the lower triangular */ 218787828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21884e2b4712SSatish Balay for (i=1; i<n; i++) { 21894e2b4712SSatish Balay v = aa + bs2*ai[i]; 21904e2b4712SSatish Balay vi = aj + ai[i]; 21914e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2192f1af5d2fSBarry Smith s = t + bs*i; 219387828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21944e2b4712SSatish Balay while (nz--) { 2195f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 21964e2b4712SSatish Balay v += bs2; 21974e2b4712SSatish Balay } 21984e2b4712SSatish Balay } 21994e2b4712SSatish Balay /* backward solve the upper triangular */ 2200d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 22014e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22024e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 22034e2b4712SSatish Balay vi 
= aj + a->diag[i] + 1; 22044e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 220587828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22064e2b4712SSatish Balay while (nz--) { 2207f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 22084e2b4712SSatish Balay v += bs2; 22094e2b4712SSatish Balay } 2210f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 221187828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22124e2b4712SSatish Balay } 22134e2b4712SSatish Balay 22144e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22154e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2216*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2218dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22194e2b4712SSatish Balay PetscFunctionReturn(0); 22204e2b4712SSatish Balay } 22214e2b4712SSatish Balay 22225c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 22235c42ef9dSBarry Smith #undef __FUNCT__ 222406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 222506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 22265c42ef9dSBarry Smith { 22275c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22285c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 22295c42ef9dSBarry Smith PetscErrorCode ierr; 22305c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2231b3260449SShri Abhyankar PetscInt i,nz,j; 2232b3260449SShri Abhyankar const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 22335c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 22345c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 22355c42ef9dSBarry Smith const PetscScalar *b; 22365c42ef9dSBarry Smith 
PetscFunctionBegin; 2237*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 22385c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22395c42ef9dSBarry Smith t = a->solve_work; 22405c42ef9dSBarry Smith 22415c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22425c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22435c42ef9dSBarry Smith 22445c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 22455c42ef9dSBarry Smith for (i=0; i<n; i++) { 22465c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22475c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 22485c42ef9dSBarry Smith } 22495c42ef9dSBarry Smith } 22505c42ef9dSBarry Smith 22515c42ef9dSBarry Smith 22525c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 22535c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 22545c42ef9dSBarry Smith for (i=0; i<n; i++){ 22555c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22565c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 22575c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 22585c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 22595c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 22605c42ef9dSBarry Smith while (nz--) { 22615c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22625c42ef9dSBarry Smith v += bs2; 22635c42ef9dSBarry Smith } 22645c42ef9dSBarry Smith } 22655c42ef9dSBarry Smith 22665c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 22675c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 22685c42ef9dSBarry Smith v = aa + bs2*ai[i]; 22695c42ef9dSBarry Smith vi = aj + ai[i]; 22705c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 22715c42ef9dSBarry Smith while (nz--) { 22725c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22735c42ef9dSBarry Smith v += bs2; 22745c42ef9dSBarry Smith } 22755c42ef9dSBarry 
Smith } 22765c42ef9dSBarry Smith 22775c42ef9dSBarry Smith /* copy t into x according to permutation */ 22785c42ef9dSBarry Smith for (i=0; i<n; i++) { 22795c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22805c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 22815c42ef9dSBarry Smith } 22825c42ef9dSBarry Smith } 22835c42ef9dSBarry Smith 22845c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22855c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2286*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22875c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22885c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22895c42ef9dSBarry Smith PetscFunctionReturn(0); 22905c42ef9dSBarry Smith } 22915c42ef9dSBarry Smith 22924a2ae208SSatish Balay #undef __FUNCT__ 22934dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 22944dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 22958499736aSShri Abhyankar { 22968499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22978499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 22988499736aSShri Abhyankar PetscErrorCode ierr; 2299b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2300b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2301b3260449SShri Abhyankar PetscInt i,j,nz; 2302b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 23038499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 23048499736aSShri Abhyankar PetscScalar *x,*t,*ls; 23058499736aSShri Abhyankar const PetscScalar *b; 2306b3260449SShri Abhyankar 23078499736aSShri Abhyankar PetscFunctionBegin; 2308*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23098499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23108499736aSShri Abhyankar t = a->solve_work; 23118499736aSShri Abhyankar 23128499736aSShri Abhyankar ierr 
= ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 23138499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 23148499736aSShri Abhyankar 23158499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 23168499736aSShri Abhyankar for (i=0; i<n; i++) { 23178499736aSShri Abhyankar for (j=0; j<bs; j++) { 23188499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 23198499736aSShri Abhyankar } 23208499736aSShri Abhyankar } 23218499736aSShri Abhyankar 23228499736aSShri Abhyankar 23238499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 23248499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 23258499736aSShri Abhyankar for (i=0; i<n; i++){ 23268499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 23278499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 23288499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 23298499736aSShri Abhyankar vi = aj + diag[i] - 1; 23308499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 23318499736aSShri Abhyankar for(j=0;j>-nz;j--){ 23328499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23338499736aSShri Abhyankar v -= bs2; 23348499736aSShri Abhyankar } 23358499736aSShri Abhyankar } 23368499736aSShri Abhyankar 23378499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 23388499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 23398499736aSShri Abhyankar v = aa + bs2*ai[i]; 23408499736aSShri Abhyankar vi = aj + ai[i]; 23418499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 23428499736aSShri Abhyankar for(j=0;j<nz;j++){ 23438499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23448499736aSShri Abhyankar v += bs2; 23458499736aSShri Abhyankar } 23468499736aSShri Abhyankar } 23478499736aSShri Abhyankar 23488499736aSShri Abhyankar /* copy t into x according to permutation */ 23498499736aSShri Abhyankar for (i=0; i<n; i++) { 
23508499736aSShri Abhyankar for (j=0; j<bs; j++) { 23518499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 23528499736aSShri Abhyankar } 23538499736aSShri Abhyankar } 23548499736aSShri Abhyankar 23558499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23568499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2357*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 23588499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23598499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 23608499736aSShri Abhyankar PetscFunctionReturn(0); 23618499736aSShri Abhyankar } 23628499736aSShri Abhyankar 2363832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 236429a97285SShri Abhyankar 23652b0b2ea7SShri Abhyankar #undef __FUNCT__ 2366832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2367832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 23682b0b2ea7SShri Abhyankar { 23692b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23702b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2371b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 23720fa040f9SShri Abhyankar PetscInt i,nz,idx,idt,m; 23730b68f018SBarry Smith const MatScalar *aa=a->a,*v; 23742b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 23752b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 23760fa040f9SShri Abhyankar PetscScalar *x; 23770b68f018SBarry Smith const PetscScalar *b; 23782b0b2ea7SShri Abhyankar 23792b0b2ea7SShri Abhyankar PetscFunctionBegin; 2380*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23812b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23822b0b2ea7SShri 
Abhyankar 23832b0b2ea7SShri Abhyankar /* forward solve the lower triangular */ 238429a97285SShri Abhyankar idx = 0; 23850fa040f9SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 23860fa040f9SShri Abhyankar x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 23870fa040f9SShri Abhyankar x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 23882b0b2ea7SShri Abhyankar 23892b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 23902b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 23912b0b2ea7SShri Abhyankar vi = aj + ai[i]; 23922b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 23930fa040f9SShri Abhyankar idt = bs*i; 23940fa040f9SShri Abhyankar s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 23950fa040f9SShri Abhyankar s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 23960fa040f9SShri Abhyankar s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 23972b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 23982b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23990fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 24000fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 24010fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 24022b0b2ea7SShri Abhyankar 24030b8f6341SShri Abhyankar 24042b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24052b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24062b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + 
v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24072b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24082b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24092b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24102b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24112b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24122b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24132b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24142b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24152b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + 
v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 24162b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24172b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24182b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24192b0b2ea7SShri Abhyankar 24202b0b2ea7SShri Abhyankar v += bs2; 24212b0b2ea7SShri Abhyankar } 24220fa040f9SShri Abhyankar x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 24230fa040f9SShri Abhyankar x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 24240fa040f9SShri Abhyankar x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 24252b0b2ea7SShri Abhyankar 24262b0b2ea7SShri Abhyankar } 24272b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 24282b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--){ 24292b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 24302b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 24312b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 24322b0b2ea7SShri Abhyankar idt = bs*i; 24330fa040f9SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 24340fa040f9SShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 24350fa040f9SShri Abhyankar s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 24362b0b2ea7SShri Abhyankar 24372b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 24382b0b2ea7SShri Abhyankar idx = bs*vi[m]; 24390fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = 
x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 24400fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 24410fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 24422b0b2ea7SShri Abhyankar 24432b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24442b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24452b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24462b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24472b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24482b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24492b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24502b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24512b0b2ea7SShri 
Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24522b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24532b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24542b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 24552b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24562b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24572b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24582b0b2ea7SShri Abhyankar 24592b0b2ea7SShri Abhyankar v += bs2; 24602b0b2ea7SShri Abhyankar } 24612b0b2ea7SShri Abhyankar 24620fa040f9SShri Abhyankar x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 24630fa040f9SShri Abhyankar x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + 
v[181]*s13 + v[196]*s14 + v[211]*s15; 24640fa040f9SShri Abhyankar x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 24650fa040f9SShri Abhyankar x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 24660fa040f9SShri Abhyankar x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 24670fa040f9SShri Abhyankar x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 24680fa040f9SShri Abhyankar x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 24690fa040f9SShri Abhyankar x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 24700fa040f9SShri Abhyankar x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 24710fa040f9SShri Abhyankar x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 24720fa040f9SShri Abhyankar x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 
24730fa040f9SShri Abhyankar x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 24740fa040f9SShri Abhyankar x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 24750fa040f9SShri Abhyankar x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 24760fa040f9SShri Abhyankar x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 24772b0b2ea7SShri Abhyankar 24782b0b2ea7SShri Abhyankar } 24792b0b2ea7SShri Abhyankar 2480*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 24812b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24822b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 24832b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 24842b0b2ea7SShri Abhyankar } 24852b0b2ea7SShri Abhyankar 2486832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing one column at at time */ 2487832cc040SShri Abhyankar /* Default MatSolve for block size 15 */ 2488832cc040SShri Abhyankar 24898499736aSShri Abhyankar #undef __FUNCT__ 2490832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1" 2491832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx) 24920b8f6341SShri Abhyankar { 24930b8f6341SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24940b8f6341SShri Abhyankar PetscErrorCode ierr; 24950b8f6341SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 24960fa040f9SShri Abhyankar PetscInt i,k,nz,kdx,idx,idt,m; 24970b8f6341SShri Abhyankar const MatScalar *aa=a->a,*v; 24980b8f6341SShri Abhyankar PetscScalar s[15]; 24990fa040f9SShri Abhyankar PetscScalar *x; 25000b8f6341SShri Abhyankar const PetscScalar *b; 25010b8f6341SShri Abhyankar 25020b8f6341SShri Abhyankar PetscFunctionBegin; 2503*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 25040b8f6341SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25050b8f6341SShri Abhyankar 25060b8f6341SShri Abhyankar /* forward solve the lower triangular */ 2507832cc040SShri Abhyankar for (i=0; i<n; i++) { 25080b8f6341SShri Abhyankar v = aa + bs2*ai[i]; 25090b8f6341SShri Abhyankar vi = aj + ai[i]; 25100b8f6341SShri Abhyankar nz = ai[i+1] - ai[i]; 25110fa040f9SShri Abhyankar idt = bs*i; 2512832cc040SShri Abhyankar x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 2513832cc040SShri Abhyankar x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 2514832cc040SShri Abhyankar x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt]; 25150b8f6341SShri Abhyankar for(m=0;m<nz;m++){ 25160b8f6341SShri Abhyankar idx = bs*vi[m]; 25170b8f6341SShri Abhyankar 
for(k=0;k<15;k++){ 25180fa040f9SShri Abhyankar kdx = k + idx; 2519832cc040SShri Abhyankar x[idt] -= v[0]*x[kdx]; 2520832cc040SShri Abhyankar x[1+idt] -= v[1]*x[kdx]; 2521832cc040SShri Abhyankar x[2+idt] -= v[2]*x[kdx]; 2522832cc040SShri Abhyankar x[3+idt] -= v[3]*x[kdx]; 2523832cc040SShri Abhyankar x[4+idt] -= v[4]*x[kdx]; 2524832cc040SShri Abhyankar x[5+idt] -= v[5]*x[kdx]; 2525832cc040SShri Abhyankar x[6+idt] -= v[6]*x[kdx]; 2526832cc040SShri Abhyankar x[7+idt] -= v[7]*x[kdx]; 2527832cc040SShri Abhyankar x[8+idt] -= v[8]*x[kdx]; 2528832cc040SShri Abhyankar x[9+idt] -= v[9]*x[kdx]; 2529832cc040SShri Abhyankar x[10+idt] -= v[10]*x[kdx]; 2530832cc040SShri Abhyankar x[11+idt] -= v[11]*x[kdx]; 2531832cc040SShri Abhyankar x[12+idt] -= v[12]*x[kdx]; 2532832cc040SShri Abhyankar x[13+idt] -= v[13]*x[kdx]; 2533832cc040SShri Abhyankar x[14+idt] -= v[14]*x[kdx]; 25340b8f6341SShri Abhyankar v += 15; 25350b8f6341SShri Abhyankar } 25360b8f6341SShri Abhyankar } 25370b8f6341SShri Abhyankar } 25380b8f6341SShri Abhyankar /* backward solve the upper triangular */ 25390b8f6341SShri Abhyankar for (i=n-1; i>=0; i--){ 25400b8f6341SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 25410b8f6341SShri Abhyankar vi = aj + adiag[i+1]+1; 25420b8f6341SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 25430b8f6341SShri Abhyankar idt = bs*i; 25440fa040f9SShri Abhyankar s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 25450fa040f9SShri Abhyankar s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 25460fa040f9SShri Abhyankar s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt]; 25470b8f6341SShri Abhyankar 25480b8f6341SShri Abhyankar for(m=0;m<nz;m++){ 25490b8f6341SShri Abhyankar idx = bs*vi[m]; 25500b8f6341SShri Abhyankar for(k=0;k<15;k++){ 25510fa040f9SShri Abhyankar kdx = k + idx; 25520fa040f9SShri Abhyankar s[0] -= v[0]*x[kdx]; 25530fa040f9SShri Abhyankar s[1] -= v[1]*x[kdx]; 25540fa040f9SShri 
Abhyankar s[2] -= v[2]*x[kdx]; 25550fa040f9SShri Abhyankar s[3] -= v[3]*x[kdx]; 25560fa040f9SShri Abhyankar s[4] -= v[4]*x[kdx]; 25570fa040f9SShri Abhyankar s[5] -= v[5]*x[kdx]; 25580fa040f9SShri Abhyankar s[6] -= v[6]*x[kdx]; 25590fa040f9SShri Abhyankar s[7] -= v[7]*x[kdx]; 25600fa040f9SShri Abhyankar s[8] -= v[8]*x[kdx]; 25610fa040f9SShri Abhyankar s[9] -= v[9]*x[kdx]; 25620fa040f9SShri Abhyankar s[10] -= v[10]*x[kdx]; 25630fa040f9SShri Abhyankar s[11] -= v[11]*x[kdx]; 25640fa040f9SShri Abhyankar s[12] -= v[12]*x[kdx]; 25650fa040f9SShri Abhyankar s[13] -= v[13]*x[kdx]; 25660fa040f9SShri Abhyankar s[14] -= v[14]*x[kdx]; 25670b8f6341SShri Abhyankar v += 15; 25680b8f6341SShri Abhyankar } 25690b8f6341SShri Abhyankar } 25700fa040f9SShri Abhyankar ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr); 25710b8f6341SShri Abhyankar for(k=0;k<15;k++){ 25720fa040f9SShri Abhyankar x[idt] += v[0]*s[k]; 25730fa040f9SShri Abhyankar x[1+idt] += v[1]*s[k]; 25740fa040f9SShri Abhyankar x[2+idt] += v[2]*s[k]; 25750fa040f9SShri Abhyankar x[3+idt] += v[3]*s[k]; 25760fa040f9SShri Abhyankar x[4+idt] += v[4]*s[k]; 25770fa040f9SShri Abhyankar x[5+idt] += v[5]*s[k]; 25780fa040f9SShri Abhyankar x[6+idt] += v[6]*s[k]; 25790fa040f9SShri Abhyankar x[7+idt] += v[7]*s[k]; 25800fa040f9SShri Abhyankar x[8+idt] += v[8]*s[k]; 25810fa040f9SShri Abhyankar x[9+idt] += v[9]*s[k]; 25820fa040f9SShri Abhyankar x[10+idt] += v[10]*s[k]; 25830fa040f9SShri Abhyankar x[11+idt] += v[11]*s[k]; 25840fa040f9SShri Abhyankar x[12+idt] += v[12]*s[k]; 25850fa040f9SShri Abhyankar x[13+idt] += v[13]*s[k]; 25860fa040f9SShri Abhyankar x[14+idt] += v[14]*s[k]; 25870b8f6341SShri Abhyankar v += 15; 25880b8f6341SShri Abhyankar } 25890b8f6341SShri Abhyankar } 2590*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 25910b8f6341SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 25920b8f6341SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 25930b8f6341SShri 
Abhyankar PetscFunctionReturn(0); 25940b8f6341SShri Abhyankar } 25950b8f6341SShri Abhyankar 25960b8f6341SShri Abhyankar 25970b8f6341SShri Abhyankar #undef __FUNCT__ 259806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 259906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 26004e2b4712SSatish Balay { 26014e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26024e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26036849ba73SBarry Smith PetscErrorCode ierr; 2604b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2605b3260449SShri Abhyankar const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2606b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2607b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2608b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2609b3260449SShri Abhyankar const PetscScalar *b; 26104e2b4712SSatish Balay 26114e2b4712SSatish Balay PetscFunctionBegin; 2612*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26131ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2614f1af5d2fSBarry Smith t = a->solve_work; 26154e2b4712SSatish Balay 26164e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26174e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26184e2b4712SSatish Balay 26194e2b4712SSatish Balay /* forward solve the lower triangular */ 26204e2b4712SSatish Balay idx = 7*(*r++); 2621f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2622f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2623f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 26244e2b4712SSatish Balay 26254e2b4712SSatish Balay for (i=1; i<n; i++) { 26264e2b4712SSatish Balay v = aa + 49*ai[i]; 26274e2b4712SSatish Balay vi = aj + ai[i]; 26284e2b4712SSatish Balay nz = diag[i] - ai[i]; 26294e2b4712SSatish Balay idx = 7*(*r++); 2630f1af5d2fSBarry Smith s1 = b[idx];s2 = 
b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2631f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 26324e2b4712SSatish Balay while (nz--) { 26334e2b4712SSatish Balay idx = 7*(*vi++); 2634f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2635f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2636f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2637f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2638f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2639f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2640f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2641f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2642f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2643f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26444e2b4712SSatish Balay v += 49; 26454e2b4712SSatish Balay } 26464e2b4712SSatish Balay idx = 7*i; 2647f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2648f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2649f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 26504e2b4712SSatish Balay } 26514e2b4712SSatish Balay /* backward solve the upper triangular */ 26524e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 26534e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 26544e2b4712SSatish Balay vi = aj + diag[i] + 1; 26554e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26564e2b4712SSatish Balay idt = 7*i; 2657f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2658f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2659f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 26604e2b4712SSatish Balay while (nz--) { 26614e2b4712SSatish Balay idx = 7*(*vi++); 2662f1af5d2fSBarry 
Smith x1 = t[idx]; x2 = t[1+idx]; 2663f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2664f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2665f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2666f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2667f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2668f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2669f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2670f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2671f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26724e2b4712SSatish Balay v += 49; 26734e2b4712SSatish Balay } 26744e2b4712SSatish Balay idc = 7*(*c--); 26754e2b4712SSatish Balay v = aa + 49*diag[i]; 2676f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2677f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2678f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2679f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2680f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2681f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2682f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2683f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2684f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2685f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2686f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2687f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2688f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2689f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 
26904e2b4712SSatish Balay } 26914e2b4712SSatish Balay 26924e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 26934e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2694*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 26951ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2696dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 26974e2b4712SSatish Balay PetscFunctionReturn(0); 26984e2b4712SSatish Balay } 26994e2b4712SSatish Balay 27008f690400SShri Abhyankar #undef __FUNCT__ 27014dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 27024dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 270335aa4fcfSShri Abhyankar { 270435aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 270535aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 270635aa4fcfSShri Abhyankar PetscErrorCode ierr; 2707b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2708b3260449SShri Abhyankar const PetscInt n=a->mbs,*rout,*cout,*vi; 2709b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 2710b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2711b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2712b3260449SShri Abhyankar const PetscScalar *b; 271335aa4fcfSShri Abhyankar 271435aa4fcfSShri Abhyankar PetscFunctionBegin; 2715*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 271635aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 271735aa4fcfSShri Abhyankar t = a->solve_work; 271835aa4fcfSShri Abhyankar 271935aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 272035aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 272135aa4fcfSShri Abhyankar 272235aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 272335aa4fcfSShri Abhyankar idx = 7*r[0]; 272435aa4fcfSShri Abhyankar t[0] = 
b[idx]; t[1] = b[1+idx]; 272535aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 272635aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 272735aa4fcfSShri Abhyankar 272835aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 272935aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 273035aa4fcfSShri Abhyankar vi = aj + ai[i]; 273135aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 273235aa4fcfSShri Abhyankar idx = 7*r[i]; 273335aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 273435aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 273535aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 273635aa4fcfSShri Abhyankar idx = 7*vi[m]; 273735aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 273835aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 273935aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 274035aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 274135aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 274235aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 274335aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 274435aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 274535aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 274635aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 274735aa4fcfSShri Abhyankar v += 49; 274835aa4fcfSShri Abhyankar } 274935aa4fcfSShri Abhyankar idx = 7*i; 275035aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 275135aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 275235aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 275335aa4fcfSShri Abhyankar } 275435aa4fcfSShri Abhyankar /* backward solve the upper 
triangular */ 275535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 275635aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 275735aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 275835aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 275935aa4fcfSShri Abhyankar idt = 7*i; 276035aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 276135aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 276235aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 276335aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 276435aa4fcfSShri Abhyankar idx = 7*vi[m]; 276535aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 276635aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 276735aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 276835aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 276935aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 277035aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 277135aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 277235aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 277335aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 277435aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 277535aa4fcfSShri Abhyankar v += 49; 277635aa4fcfSShri Abhyankar } 277735aa4fcfSShri Abhyankar idc = 7*c[i]; 277835aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 277935aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 278035aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 278135aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 278235aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 278335aa4fcfSShri Abhyankar 
v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 278435aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 278535aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 278635aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 278735aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 278835aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 278935aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 279035aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 279135aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 279235aa4fcfSShri Abhyankar } 279335aa4fcfSShri Abhyankar 279435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 279535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2796*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 279735aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 279835aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 279935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 280035aa4fcfSShri Abhyankar } 280135aa4fcfSShri Abhyankar
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_inplace - Forward/backward block triangular solve
   for block size 7 using the factors stored in A, with no row/column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution (used as the work array during the solve)

   Notes:
   Storage read by this routine, per block row i: blocks a->i[i] .. a->diag[i]-1 are
   used in the forward sweep, the block at a->diag[i] is the diagonal block, and blocks
   a->diag[i]+1 .. a->i[i+1]-1 are used in the backward sweep.  Each 7x7 block is
   addressed as v[row + 7*col].  The diagonal block is applied by multiplication, so it
   is presumably stored already inverted by the factorization - confirm against the
   factor routine.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscErrorCode    ierr;
  PetscInt          i,nz,idx,idt,jdx;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the copy when there are no block rows */
  if (n > 0) {
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
    x[3] = b[3]; x[4] = b[4]; x[5] = b[5];
    x[6] = b[6];
  }
  for (i=1; i<n; i++) {
    v   = aa + 49*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 7*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    s7  = b[6+idx];
    while (nz--) {
      jdx = 7*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      x7  = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 49*diag[i] + 49;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 7*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    s7  = x[6+idt];
    while (nz--) {
      idx = 7*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      x7  = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += 49;
    }
    /* multiply by the stored diagonal block of row i */
    v        = aa + 49*diag[i];
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4
                       + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4
                       + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4
                       + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4
                       + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4
                       + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4
                       + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4
                       + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* 7x7 blocks: 49 entries per block, 7 unknowns per block column (was logged with the bs=6 constants) */
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Forward/backward block triangular solve for
   block size 7 using the factors stored in A, with no row/column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution (used as the work array during the solve)

   Notes:
   Storage read by this routine, per block row i: blocks a->i[i] .. a->i[i+1]-1 are
   used in the forward sweep; blocks a->diag[i+1]+1 .. a->diag[i]-1 are used in the
   backward sweep and are immediately followed by the diagonal block at a->diag[i].
   The strides bs and bs2 are read from the matrix, but the unrolled arithmetic below
   is written for bs == 7 and bs2 == 49.  Each block is addressed as v[row + 7*col].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscErrorCode    ierr;
  PetscInt          i,k,nz,idx,jdx,idt;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the copy when there are no block rows */
  if (n > 0) {
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    x[4] = b[4]; x[5] = b[5]; x[6] = b[6];
  }
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx]; s7 = b[6+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx]; x7 = x[6+jdx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
    x[6+idx] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = bs*i;
    s1  = x[idt];   s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt]; s7 = x[6+idt];
    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      x5  = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* x = inv_diagonal*x; v now points at the block stored right after the nz blocks just consumed */
    x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7;
    x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7;
    x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7;
    x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7;
    x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7;
    x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7;
    x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_inplace - Forward/backward block triangular solve for block size 6
   using the factors stored in A, applying the row and column index sets a->row and
   a->col.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   Block row i of the right-hand side is gathered from b at block index r[i] (indices of
   a->row) into the work array a->solve_work; the finished block row i is scattered to x
   at block index c[i] (indices of a->col).
   Storage read by this routine, per block row i: blocks a->i[i] .. a->diag[i]-1 are
   used in the forward sweep, the block at a->diag[i] is the diagonal block, and blocks
   a->diag[i]+1 .. a->i[i+1]-1 are used in the backward sweep.  Each 6x6 block is
   addressed as v[row + 6*col].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
  PetscInt          i,nz,idx,idt,idc;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  /* r and c are indexed by block row, so no pointer is ever stepped outside its array */
  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the gather when there are no block rows */
  if (n > 0) {
    idx  = 6*r[0];
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    t[4] = b[4+idx]; t[5] = b[5+idx];
  }
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6*r[i];
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
      x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    idx      = 6*i;
    t[idx]   = s1; t[1+idx] = s2;
    t[2+idx] = s3; t[3+idx] = s4;
    t[4+idx] = s5; t[5+idx] = s6;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 36*diag[i] + 36;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 6*i;
    s1  = t[idt];   s2 = t[1+idt];
    s3  = t[2+idt]; s4 = t[3+idt];
    s5  = t[4+idt]; s6 = t[5+idt];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      x5  = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* multiply by the stored diagonal block and scatter to the permuted position */
    idc      = 6*c[i];
    v        = aa + 36*diag[i];
    x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
                          v[18]*s4+v[24]*s5+v[30]*s6;
    x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
                          v[19]*s4+v[25]*s5+v[31]*s6;
    x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
                          v[20]*s4+v[26]*s5+v[32]*s6;
    x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
                          v[21]*s4+v[27]*s5+v[33]*s6;
    x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
                          v[22]*s4+v[28]*s5+v[34]*s6;
    x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
                          v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6 - Forward/backward block triangular solve for block size 6 using
   the factors stored in A, applying the row and column index sets a->row and a->col.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution

   Notes:
   Block row i of the right-hand side is gathered from b at block index r[i] into the
   work array a->solve_work; the finished block row i is scattered to x at block index
   c[i].
   Storage read by this routine, per block row i: blocks a->i[i] .. a->i[i+1]-1 are
   used in the forward sweep; blocks a->diag[i+1]+1 .. a->diag[i]-1 are used in the
   backward sweep and are immediately followed by the diagonal block at a->diag[i].
   Each 6x6 block is addressed as v[row + 6*col].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6"
PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  IS                iscol=a->col,isrow=a->row;
  PetscErrorCode    ierr;
  const PetscInt    *r,*c,*rout,*cout;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscInt          i,nz,idx,idt,idc,m;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the gather when there are no block rows */
  if (n > 0) {
    idx  = 6*r[0];
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    t[4] = b[4+idx]; t[5] = b[5+idx];
  }
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = 6*r[i];
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    for (m=0; m<nz; m++) {
      idx = 6*vi[m];
      x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
      x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    idx      = 6*i;
    t[idx]   = s1; t[1+idx] = s2;
    t[2+idx] = s3; t[3+idx] = s4;
    t[4+idx] = s5; t[5+idx] = s6;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 36*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1] - 1;
    idt = 6*i;
    s1  = t[idt];   s2 = t[1+idt];
    s3  = t[2+idt]; s4 = t[3+idt];
    s5  = t[4+idt]; s6 = t[5+idt];
    for (m=0; m<nz; m++) {
      idx = 6*vi[m];
      x1  = t[idx];   x2 = t[1+idx];
      x3  = t[2+idx]; x4 = t[3+idx];
      x5  = t[4+idx]; x6 = t[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* v now points at the block stored right after the nz blocks just consumed (the diagonal block) */
    idc      = 6*c[i];
    x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
                          v[18]*s4+v[24]*s5+v[30]*s6;
    x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
                          v[19]*s4+v[25]*s5+v[31]*s6;
    x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
                          v[20]*s4+v[26]*s5+v[32]*s6;
    x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
                          v[21]*s4+v[27]*s5+v[33]*s6;
    x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
                          v[22]*s4+v[28]*s5+v[34]*s6;
    x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
                          v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_inplace - Forward/backward block triangular solve
   for block size 6 using the factors stored in A, with no row/column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution (used as the work array during the solve)

   Notes:
   Storage read by this routine, per block row i: blocks a->i[i] .. a->diag[i]-1 are
   used in the forward sweep, the block at a->diag[i] is the diagonal block, and blocks
   a->diag[i]+1 .. a->i[i+1]-1 are used in the backward sweep.  Each 6x6 block is
   addressed as v[row + 6*col].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscInt          i,nz,idx,idt,jdx;
  PetscErrorCode    ierr;
  const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the copy when there are no block rows */
  if (n > 0) {
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2];
    x[3] = b[3]; x[4] = b[4]; x[5] = b[5];
  }
  for (i=1; i<n; i++) {
    v   = aa + 36*ai[i];
    vi  = aj + ai[i];
    nz  = diag[i] - ai[i];
    idx = 6*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
    s4  = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    while (nz--) {
      jdx = 6*(*vi++);
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
      x4  = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + 36*diag[i] + 36;
    vi  = aj + diag[i] + 1;
    nz  = ai[i+1] - diag[i] - 1;
    idt = 6*i;
    s1  = x[idt];   s2 = x[1+idt];
    s3  = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    while (nz--) {
      idx = 6*(*vi++);
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
      x4  = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* multiply by the stored diagonal block of row i */
    v        = aa + 36*diag[i];
    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}

/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Forward/backward block triangular solve for
   block size 6 using the factors stored in A, with no row/column permutation.

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution (used as the work array during the solve)

   Notes:
   Storage read by this routine, per block row i: blocks a->i[i] .. a->i[i+1]-1 are
   used in the forward sweep; blocks a->diag[i+1]+1 .. a->diag[i]-1 are used in the
   backward sweep and are immediately followed by the diagonal block at a->diag[i].
   The strides bs and bs2 are read from the matrix, but the unrolled arithmetic below
   is written for bs == 6 and bs2 == 36.  Each block is addressed as v[row + 6*col].
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
  PetscErrorCode    ierr;
  PetscInt          i,k,nz,idx,jdx,idt;
  const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       *x;
  const PetscScalar *b;
  PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate; skip the copy when there are no block rows */
  if (n > 0) {
    x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
    x[4] = b[4]; x[5] = b[5];
  }
  for (i=1; i<n; i++) {
    v   = aa + bs2*ai[i];
    vi  = aj + ai[i];
    nz  = ai[i+1] - ai[i];
    idx = bs*i;
    s1  = b[idx];   s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx];
    s5  = b[4+idx]; s6 = b[5+idx];
    for (k=0; k<nz; k++) {
      jdx = bs*vi[k];
      x1  = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx]; x4 = x[3+jdx];
      x5  = x[4+jdx]; x6 = x[5+jdx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += bs2;
    }

    x[idx]   = s1;
    x[1+idx] = s2;
    x[2+idx] = s3;
    x[3+idx] = s4;
    x[4+idx] = s5;
    x[5+idx] = s6;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    v   = aa + bs2*(adiag[i+1]+1);
    vi  = aj + adiag[i+1]+1;
    nz  = adiag[i] - adiag[i+1]-1;
    idt = bs*i;
    s1  = x[idt];   s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt];
    s5  = x[4+idt]; s6 = x[5+idt];
    for (k=0; k<nz; k++) {
      idx = bs*vi[k];
      x1  = x[idx];   x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx];
      x5  = x[4+idx]; x6 = x[5+idx];
      s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += bs2;
    }
    /* x = inv_diagonal*x; v now points at the block stored right after the nz blocks just consumed */
    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
334453cca76cSShri Abhyankar 334553cca76cSShri Abhyankar #undef __FUNCT__ 334606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 334706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 33484e2b4712SSatish Balay { 33494e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 33504e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 33516849ba73SBarry Smith PetscErrorCode ierr; 33525d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3353b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3354b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 3355d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3356d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3357d9fead3dSBarry Smith const PetscScalar *b; 33584e2b4712SSatish Balay 33594e2b4712SSatish Balay PetscFunctionBegin; 3360*3649974fSBarry Smith ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 33611ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3362f1af5d2fSBarry Smith t = a->solve_work; 33634e2b4712SSatish Balay 33644e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 33654e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 33664e2b4712SSatish Balay 33674e2b4712SSatish Balay /* forward solve the lower triangular */ 33684e2b4712SSatish Balay idx = 5*(*r++); 3369f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3370f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 33714e2b4712SSatish Balay for (i=1; i<n; i++) { 33724e2b4712SSatish Balay v = aa + 25*ai[i]; 33734e2b4712SSatish Balay vi = aj + ai[i]; 33744e2b4712SSatish Balay nz = diag[i] - ai[i]; 33754e2b4712SSatish Balay idx = 5*(*r++); 3376f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3377f1af5d2fSBarry Smith s5 = b[4+idx]; 33784e2b4712SSatish Balay while (nz--) { 33794e2b4712SSatish Balay idx = 5*(*vi++); 3380f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3381f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 3382f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3383f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3384f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3385f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3386f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 33874e2b4712SSatish Balay v += 25; 33884e2b4712SSatish Balay } 33894e2b4712SSatish Balay idx = 5*i; 3390f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3391f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 33924e2b4712SSatish Balay } 33934e2b4712SSatish Balay /* backward solve the upper triangular */ 33944e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 33954e2b4712SSatish Balay v = aa + 25*diag[i] + 
25; 33964e2b4712SSatish Balay vi = aj + diag[i] + 1; 33974e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 33984e2b4712SSatish Balay idt = 5*i; 3399f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3400f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 34014e2b4712SSatish Balay while (nz--) { 34024e2b4712SSatish Balay idx = 5*(*vi++); 3403f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3404f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3405f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3406f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3407f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3408f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3409f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 34104e2b4712SSatish Balay v += 25; 34114e2b4712SSatish Balay } 34124e2b4712SSatish Balay idc = 5*(*c--); 34134e2b4712SSatish Balay v = aa + 25*diag[i]; 3414f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3415f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 3416f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3417f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 3418f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3419f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 3420f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3421f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 3422f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3423f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 34244e2b4712SSatish Balay } 34254e2b4712SSatish Balay 34264e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 34274e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3428*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 34291ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3430dc0b31edSSatish 
Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 34314e2b4712SSatish Balay PetscFunctionReturn(0); 34324e2b4712SSatish Balay } 34334e2b4712SSatish Balay 343478bb4007SShri Abhyankar #undef __FUNCT__ 34354dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5" 34364dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 343778bb4007SShri Abhyankar { 343878bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 343978bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 344078bb4007SShri Abhyankar PetscErrorCode ierr; 344178bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3442b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3443b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 344478bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 344578bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 344678bb4007SShri Abhyankar const PetscScalar *b; 344778bb4007SShri Abhyankar 344878bb4007SShri Abhyankar PetscFunctionBegin; 3449*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 345078bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 345178bb4007SShri Abhyankar t = a->solve_work; 345278bb4007SShri Abhyankar 345378bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 345478bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 345578bb4007SShri Abhyankar 345678bb4007SShri Abhyankar /* forward solve the lower triangular */ 345778bb4007SShri Abhyankar idx = 5*r[0]; 345878bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 345978bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 346078bb4007SShri Abhyankar for (i=1; i<n; i++) { 346178bb4007SShri Abhyankar v = aa + 25*ai[i]; 346278bb4007SShri Abhyankar vi = aj + ai[i]; 346378bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 346478bb4007SShri Abhyankar idx = 5*r[i]; 346578bb4007SShri Abhyankar s1 = b[idx];s2 = 
b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 346678bb4007SShri Abhyankar s5 = b[4+idx]; 346778bb4007SShri Abhyankar for(m=0;m<nz;m++){ 346878bb4007SShri Abhyankar idx = 5*vi[m]; 346978bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 347078bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 347178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 347278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 347378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 347478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 347578bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 347678bb4007SShri Abhyankar v += 25; 347778bb4007SShri Abhyankar } 347878bb4007SShri Abhyankar idx = 5*i; 347978bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 348078bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 348178bb4007SShri Abhyankar } 348278bb4007SShri Abhyankar /* backward solve the upper triangular */ 348378bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 348478bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 348578bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 348678bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 348778bb4007SShri Abhyankar idt = 5*i; 348878bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 348978bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 349078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 349178bb4007SShri Abhyankar idx = 5*vi[m]; 349278bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 349378bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 349478bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 349578bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 349678bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 349778bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 
+ v[23]*x5; 349878bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 349978bb4007SShri Abhyankar v += 25; 350078bb4007SShri Abhyankar } 350178bb4007SShri Abhyankar idc = 5*c[i]; 350278bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 350378bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 350478bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 350578bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 350678bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 350778bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 350878bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 350978bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 351078bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 351178bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 351278bb4007SShri Abhyankar } 351378bb4007SShri Abhyankar 351478bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 351578bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3516*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 351778bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 351878bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 351978bb4007SShri Abhyankar PetscFunctionReturn(0); 352078bb4007SShri Abhyankar } 352178bb4007SShri Abhyankar 35228f690400SShri Abhyankar #undef __FUNCT__ 352306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 352406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 352515091d37SBarry Smith { 352615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3527b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3528b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3529dfbe8321SBarry Smith PetscErrorCode ierr; 3530d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3531d9fead3dSBarry Smith PetscScalar 
*x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3532d9fead3dSBarry Smith const PetscScalar *b; 353315091d37SBarry Smith 353415091d37SBarry Smith PetscFunctionBegin; 3535*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 35361ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 353715091d37SBarry Smith /* forward solve the lower triangular */ 353815091d37SBarry Smith idx = 0; 353915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 354015091d37SBarry Smith for (i=1; i<n; i++) { 354115091d37SBarry Smith v = aa + 25*ai[i]; 354215091d37SBarry Smith vi = aj + ai[i]; 354315091d37SBarry Smith nz = diag[i] - ai[i]; 354415091d37SBarry Smith idx = 5*i; 3545f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 354615091d37SBarry Smith while (nz--) { 354715091d37SBarry Smith jdx = 5*(*vi++); 354815091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3549f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3550f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3551f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3552f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3553f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 355415091d37SBarry Smith v += 25; 355515091d37SBarry Smith } 3556f1af5d2fSBarry Smith x[idx] = s1; 3557f1af5d2fSBarry Smith x[1+idx] = s2; 3558f1af5d2fSBarry Smith x[2+idx] = s3; 3559f1af5d2fSBarry Smith x[3+idx] = s4; 3560f1af5d2fSBarry Smith x[4+idx] = s5; 356115091d37SBarry Smith } 356215091d37SBarry Smith /* backward solve the upper triangular */ 356315091d37SBarry Smith for (i=n-1; i>=0; i--){ 356415091d37SBarry Smith v = aa + 25*diag[i] + 25; 356515091d37SBarry Smith vi = aj + diag[i] + 1; 356615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 356715091d37SBarry Smith idt = 5*i; 
3568f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3569f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 357015091d37SBarry Smith while (nz--) { 357115091d37SBarry Smith idx = 5*(*vi++); 357215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3573f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3574f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3575f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3576f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3577f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 357815091d37SBarry Smith v += 25; 357915091d37SBarry Smith } 358015091d37SBarry Smith v = aa + 25*diag[i]; 3581f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3582f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3583f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3584f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3585f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 358615091d37SBarry Smith } 358715091d37SBarry Smith 3588*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 35891ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3590dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 359115091d37SBarry Smith PetscFunctionReturn(0); 359215091d37SBarry Smith } 359315091d37SBarry Smith 3594cee9d6f2SShri Abhyankar #undef __FUNCT__ 35954dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 35964dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 359753cca76cSShri Abhyankar { 359853cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3599b3260449SShri 
Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3600b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 360153cca76cSShri Abhyankar PetscErrorCode ierr; 360253cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 360353cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 360453cca76cSShri Abhyankar const PetscScalar *b; 360553cca76cSShri Abhyankar 360653cca76cSShri Abhyankar PetscFunctionBegin; 3607*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 360853cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 360953cca76cSShri Abhyankar /* forward solve the lower triangular */ 361053cca76cSShri Abhyankar idx = 0; 361153cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 361253cca76cSShri Abhyankar for (i=1; i<n; i++) { 361353cca76cSShri Abhyankar v = aa + 25*ai[i]; 361453cca76cSShri Abhyankar vi = aj + ai[i]; 361553cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 361653cca76cSShri Abhyankar idx = 5*i; 361753cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 361853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 361953cca76cSShri Abhyankar jdx = 5*vi[k]; 362053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 362153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 362253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 362353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 362453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 362553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 362653cca76cSShri Abhyankar v += 25; 362753cca76cSShri Abhyankar } 362853cca76cSShri Abhyankar x[idx] = s1; 362953cca76cSShri Abhyankar x[1+idx] = s2; 363053cca76cSShri Abhyankar x[2+idx] = s3; 363153cca76cSShri Abhyankar x[3+idx] = s4; 363253cca76cSShri 
Abhyankar x[4+idx] = s5; 363353cca76cSShri Abhyankar } 363453cca76cSShri Abhyankar 363553cca76cSShri Abhyankar /* backward solve the upper triangular */ 363653cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 363753cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 363853cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 363953cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 364053cca76cSShri Abhyankar idt = 5*i; 364153cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 364253cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 364353cca76cSShri Abhyankar for(k=0;k<nz;k++){ 364453cca76cSShri Abhyankar idx = 5*vi[k]; 364553cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 364653cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 364753cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 364853cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 364953cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 365053cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 365153cca76cSShri Abhyankar v += 25; 365253cca76cSShri Abhyankar } 365353cca76cSShri Abhyankar /* x = inv_diagonal*x */ 365453cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 365553cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 365653cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 365753cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 365853cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 365953cca76cSShri Abhyankar } 366053cca76cSShri Abhyankar 3661*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 366253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 366353cca76cSShri Abhyankar ierr = 
PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 366453cca76cSShri Abhyankar PetscFunctionReturn(0); 366553cca76cSShri Abhyankar } 366653cca76cSShri Abhyankar 366753cca76cSShri Abhyankar #undef __FUNCT__ 366806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 366906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 36704e2b4712SSatish Balay { 36714e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 36724e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 36736849ba73SBarry Smith PetscErrorCode ierr; 3674b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3675b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 36765d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3677d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3678d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3679d9fead3dSBarry Smith const PetscScalar *b; 36804e2b4712SSatish Balay 36814e2b4712SSatish Balay PetscFunctionBegin; 3682*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 36831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3684f1af5d2fSBarry Smith t = a->solve_work; 36854e2b4712SSatish Balay 36864e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 36874e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 36884e2b4712SSatish Balay 36894e2b4712SSatish Balay /* forward solve the lower triangular */ 36904e2b4712SSatish Balay idx = 4*(*r++); 3691f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3692f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 36934e2b4712SSatish Balay for (i=1; i<n; i++) { 36944e2b4712SSatish Balay v = aa + 16*ai[i]; 36954e2b4712SSatish Balay vi = aj + ai[i]; 36964e2b4712SSatish Balay nz = diag[i] - ai[i]; 36974e2b4712SSatish Balay idx = 4*(*r++); 3698f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 36994e2b4712SSatish Balay while (nz--) { 
37004e2b4712SSatish Balay idx = 4*(*vi++); 3701f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3702f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3703f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3704f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3705f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 37064e2b4712SSatish Balay v += 16; 37074e2b4712SSatish Balay } 37084e2b4712SSatish Balay idx = 4*i; 3709f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3710f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 37114e2b4712SSatish Balay } 37124e2b4712SSatish Balay /* backward solve the upper triangular */ 37134e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 37144e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 37154e2b4712SSatish Balay vi = aj + diag[i] + 1; 37164e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 37174e2b4712SSatish Balay idt = 4*i; 3718f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3719f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 37204e2b4712SSatish Balay while (nz--) { 37214e2b4712SSatish Balay idx = 4*(*vi++); 3722f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3723f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3724f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3725f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3726f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3727f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 37284e2b4712SSatish Balay v += 16; 37294e2b4712SSatish Balay } 37304e2b4712SSatish Balay idc = 4*(*c--); 37314e2b4712SSatish Balay v = aa + 16*diag[i]; 3732f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3733f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3734f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3735f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = 
v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 37364e2b4712SSatish Balay } 37374e2b4712SSatish Balay 37384e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 37394e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3740*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 37411ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3742dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37434e2b4712SSatish Balay PetscFunctionReturn(0); 37444e2b4712SSatish Balay } 3745f26ec98cSKris Buschelman 37468f690400SShri Abhyankar #undef __FUNCT__ 37474dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 37484dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 374978bb4007SShri Abhyankar { 375078bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 375178bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 375278bb4007SShri Abhyankar PetscErrorCode ierr; 3753b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3754b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 375578bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 375678bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 375778bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 375878bb4007SShri Abhyankar const PetscScalar *b; 375978bb4007SShri Abhyankar 376078bb4007SShri Abhyankar PetscFunctionBegin; 3761*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 376278bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 376378bb4007SShri Abhyankar t = a->solve_work; 376478bb4007SShri Abhyankar 376578bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 376678bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 376778bb4007SShri Abhyankar 376878bb4007SShri Abhyankar /* forward solve the lower triangular */ 376978bb4007SShri Abhyankar idx = 4*r[0]; 377078bb4007SShri 
Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 377178bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 377278bb4007SShri Abhyankar for (i=1; i<n; i++) { 377378bb4007SShri Abhyankar v = aa + 16*ai[i]; 377478bb4007SShri Abhyankar vi = aj + ai[i]; 377578bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 377678bb4007SShri Abhyankar idx = 4*r[i]; 377778bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 377878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 377978bb4007SShri Abhyankar idx = 4*vi[m]; 378078bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 378178bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 378278bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 378378bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 378478bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 378578bb4007SShri Abhyankar v += 16; 378678bb4007SShri Abhyankar } 378778bb4007SShri Abhyankar idx = 4*i; 378878bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 378978bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 379078bb4007SShri Abhyankar } 379178bb4007SShri Abhyankar /* backward solve the upper triangular */ 379278bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 379378bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 379478bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 379578bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 379678bb4007SShri Abhyankar idt = 4*i; 379778bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 379878bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 379978bb4007SShri Abhyankar for(m=0;m<nz;m++){ 380078bb4007SShri Abhyankar idx = 4*vi[m]; 380178bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 380278bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 380378bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 380478bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 380578bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + 
v[14]*x4; 380678bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 380778bb4007SShri Abhyankar v += 16; 380878bb4007SShri Abhyankar } 380978bb4007SShri Abhyankar idc = 4*c[i]; 381078bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 381178bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 381278bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 381378bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 381478bb4007SShri Abhyankar } 381578bb4007SShri Abhyankar 381678bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 381778bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3818*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 381978bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 382078bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 382178bb4007SShri Abhyankar PetscFunctionReturn(0); 382278bb4007SShri Abhyankar } 382378bb4007SShri Abhyankar 382478bb4007SShri Abhyankar #undef __FUNCT__ 3825f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3826dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3827f26ec98cSKris Buschelman { 3828f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3829f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 38306849ba73SBarry Smith PetscErrorCode ierr; 3831b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3832b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 38335d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3834d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3835d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3836d9fead3dSBarry Smith PetscScalar *x; 3837d9fead3dSBarry Smith const PetscScalar *b; 3838f26ec98cSKris Buschelman 3839f26ec98cSKris Buschelman 
PetscFunctionBegin; 3840*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 38411ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3842f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3843f26ec98cSKris Buschelman 3844f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3845f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3846f26ec98cSKris Buschelman 3847f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3848f26ec98cSKris Buschelman idx = 4*(*r++); 3849f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3850f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3851f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3852f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3853f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3854f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3855f26ec98cSKris Buschelman vi = aj + ai[i]; 3856f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3857f26ec98cSKris Buschelman idx = 4*(*r++); 3858f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3859f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3860f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3861f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3862f26ec98cSKris Buschelman while (nz--) { 3863f26ec98cSKris Buschelman idx = 4*(*vi++); 3864f26ec98cSKris Buschelman x1 = t[idx]; 3865f26ec98cSKris Buschelman x2 = t[1+idx]; 3866f26ec98cSKris Buschelman x3 = t[2+idx]; 3867f26ec98cSKris Buschelman x4 = t[3+idx]; 3868f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3869f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3870f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3871f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3872f26ec98cSKris Buschelman v += 16; 3873f26ec98cSKris Buschelman } 3874f26ec98cSKris Buschelman idx = 4*i; 3875f26ec98cSKris Buschelman t[idx] = s1; 3876f26ec98cSKris 
Buschelman t[1+idx] = s2; 3877f26ec98cSKris Buschelman t[2+idx] = s3; 3878f26ec98cSKris Buschelman t[3+idx] = s4; 3879f26ec98cSKris Buschelman } 3880f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3881f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3882f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 3883f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3884f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3885f26ec98cSKris Buschelman idt = 4*i; 3886f26ec98cSKris Buschelman s1 = t[idt]; 3887f26ec98cSKris Buschelman s2 = t[1+idt]; 3888f26ec98cSKris Buschelman s3 = t[2+idt]; 3889f26ec98cSKris Buschelman s4 = t[3+idt]; 3890f26ec98cSKris Buschelman while (nz--) { 3891f26ec98cSKris Buschelman idx = 4*(*vi++); 3892f26ec98cSKris Buschelman x1 = t[idx]; 3893f26ec98cSKris Buschelman x2 = t[1+idx]; 3894f26ec98cSKris Buschelman x3 = t[2+idx]; 3895f26ec98cSKris Buschelman x4 = t[3+idx]; 3896f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3897f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3898f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3899f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3900f26ec98cSKris Buschelman v += 16; 3901f26ec98cSKris Buschelman } 3902f26ec98cSKris Buschelman idc = 4*(*c--); 3903f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3904f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3905f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3906f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3907f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3908f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3909f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3910f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3911f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3912f26ec98cSKris Buschelman } 3913f26ec98cSKris Buschelman 3914f26ec98cSKris 
Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3915f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3916*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 39171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3918dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3919f26ec98cSKris Buschelman PetscFunctionReturn(0); 3920f26ec98cSKris Buschelman } 3921f26ec98cSKris Buschelman 392224c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 392324c233c2SKris Buschelman 392424c233c2SKris Buschelman #include PETSC_HAVE_SSE 392524c233c2SKris Buschelman 392624c233c2SKris Buschelman #undef __FUNCT__ 392724c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3928dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 392924c233c2SKris Buschelman { 393024c233c2SKris Buschelman /* 393124c233c2SKris Buschelman Note: This code uses demotion of double 393224c233c2SKris Buschelman to float when performing the mixed-mode computation. 393324c233c2SKris Buschelman This may not be numerically reasonable for all applications. 
393424c233c2SKris Buschelman */ 393524c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 393624c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 39376849ba73SBarry Smith PetscErrorCode ierr; 39385d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 39395d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 394024c233c2SKris Buschelman MatScalar *aa=a->a,*v; 394187828ca2SBarry Smith PetscScalar *x,*b,*t; 394224c233c2SKris Buschelman 394324c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 394424c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 394524c233c2SKris Buschelman unsigned long offset; 394624c233c2SKris Buschelman 394724c233c2SKris Buschelman PetscFunctionBegin; 394824c233c2SKris Buschelman SSE_SCOPE_BEGIN; 394924c233c2SKris Buschelman 395024c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 395124c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 395224c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 395324c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 395424c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 395524c233c2SKris Buschelman 39561ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39571ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 395824c233c2SKris Buschelman t = a->solve_work; 395924c233c2SKris Buschelman 396024c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 396124c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 396224c233c2SKris Buschelman 396324c233c2SKris Buschelman /* forward solve the lower triangular */ 396424c233c2SKris Buschelman idx = 4*(*r++); 396524c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 396624c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 396724c233c2SKris Buschelman v = aa + 16*ai[1]; 396824c233c2SKris Buschelman 396924c233c2SKris Buschelman for (i=1; i<n;) { 
397024c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 397124c233c2SKris Buschelman vi = aj + ai[i]; 397224c233c2SKris Buschelman nz = diag[i] - ai[i]; 397324c233c2SKris Buschelman idx = 4*(*r++); 397424c233c2SKris Buschelman 397524c233c2SKris Buschelman /* Demote sum from double to float */ 397624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 397724c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 397824c233c2SKris Buschelman 397924c233c2SKris Buschelman while (nz--) { 398024c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 398124c233c2SKris Buschelman idx = 4*(*vi++); 398224c233c2SKris Buschelman 398324c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 398424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 398524c233c2SKris Buschelman 398624c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 398724c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 398824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 398924c233c2SKris Buschelman 399024c233c2SKris Buschelman /* First Column */ 399124c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 399224c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 399324c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 399424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 399524c233c2SKris Buschelman 399624c233c2SKris Buschelman /* Second Column */ 399724c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 399824c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 399924c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 400024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 400124c233c2SKris Buschelman 400224c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 400324c233c2SKris Buschelman 400424c233c2SKris Buschelman /* Third Column */ 400524c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 400624c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 400724c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 400824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 
400924c233c2SKris Buschelman 401024c233c2SKris Buschelman /* Fourth Column */ 401124c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 401224c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 401324c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 401424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 401524c233c2SKris Buschelman SSE_INLINE_END_2 401624c233c2SKris Buschelman 401724c233c2SKris Buschelman v += 16; 401824c233c2SKris Buschelman } 401924c233c2SKris Buschelman idx = 4*i; 402024c233c2SKris Buschelman v = aa + 16*ai[++i]; 402124c233c2SKris Buschelman PREFETCH_NTA(v); 402224c233c2SKris Buschelman STORE_PS(tmps,XMM7); 402324c233c2SKris Buschelman 402424c233c2SKris Buschelman /* Promote result from float to double */ 402524c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 402624c233c2SKris Buschelman } 402724c233c2SKris Buschelman /* backward solve the upper triangular */ 402824c233c2SKris Buschelman idt = 4*(n-1); 402924c233c2SKris Buschelman ai16 = 16*diag[n-1]; 403024c233c2SKris Buschelman v = aa + ai16 + 16; 403124c233c2SKris Buschelman for (i=n-1; i>=0;){ 403224c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 403324c233c2SKris Buschelman vi = aj + diag[i] + 1; 403424c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 403524c233c2SKris Buschelman 403624c233c2SKris Buschelman /* Demote accumulator from double to float */ 403724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 403824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 403924c233c2SKris Buschelman 404024c233c2SKris Buschelman while (nz--) { 404124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 404224c233c2SKris Buschelman idx = 4*(*vi++); 404324c233c2SKris Buschelman 404424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 404524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 404624c233c2SKris Buschelman 404724c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 404824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 
404924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 405024c233c2SKris Buschelman 405124c233c2SKris Buschelman /* First Column */ 405224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 405324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 405424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 405524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 405624c233c2SKris Buschelman 405724c233c2SKris Buschelman /* Second Column */ 405824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 405924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 406024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 406124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 406224c233c2SKris Buschelman 406324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 406424c233c2SKris Buschelman 406524c233c2SKris Buschelman /* Third Column */ 406624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 406724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 406824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 406924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 407024c233c2SKris Buschelman 407124c233c2SKris Buschelman /* Fourth Column */ 407224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 407324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 407424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 407524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 407624c233c2SKris Buschelman SSE_INLINE_END_2 407724c233c2SKris Buschelman v += 16; 407824c233c2SKris Buschelman } 407924c233c2SKris Buschelman v = aa + ai16; 408024c233c2SKris Buschelman ai16 = 16*diag[--i]; 408124c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 408224c233c2SKris Buschelman /* 408324c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 408424c233c2SKris Buschelman which was inverted as part of the factorization 408524c233c2SKris Buschelman */ 408624c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 408724c233c2SKris Buschelman /* First Column */ 408824c233c2SKris Buschelman 
SSE_COPY_PS(XMM0,XMM7) 408924c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 409024c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 409124c233c2SKris Buschelman 409224c233c2SKris Buschelman /* Second Column */ 409324c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 409424c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 409524c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 409624c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 409724c233c2SKris Buschelman 409824c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 409924c233c2SKris Buschelman 410024c233c2SKris Buschelman /* Third Column */ 410124c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 410224c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 410324c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 410424c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 410524c233c2SKris Buschelman 410624c233c2SKris Buschelman /* Fourth Column */ 410724c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 410824c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 410924c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 411024c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 411124c233c2SKris Buschelman 411224c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 411324c233c2SKris Buschelman SSE_INLINE_END_3 411424c233c2SKris Buschelman 411524c233c2SKris Buschelman /* Promote solution from float to double */ 411624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 411724c233c2SKris Buschelman 411824c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 411924c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 412024c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! 
*/ 412124c233c2SKris Buschelman idc = 4*(*c--); 412224c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 412324c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 412424c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 412524c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 412624c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 412724c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 412824c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 412924c233c2SKris Buschelman SSE_INLINE_END_2 413024c233c2SKris Buschelman v = aa + ai16 + 16; 413124c233c2SKris Buschelman idt -= 4; 413224c233c2SKris Buschelman } 413324c233c2SKris Buschelman 413424c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 413524c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 41361ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4138dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 413924c233c2SKris Buschelman SSE_SCOPE_END; 414024c233c2SKris Buschelman PetscFunctionReturn(0); 414124c233c2SKris Buschelman } 414224c233c2SKris Buschelman 414324c233c2SKris Buschelman #endif 41440ef38995SBarry Smith 41450ef38995SBarry Smith 41464e2b4712SSatish Balay /* 41474e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 41484e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 
41494e2b4712SSatish Balay */ 41504a2ae208SSatish Balay #undef __FUNCT__ 415106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 415206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 41534e2b4712SSatish Balay { 41544e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4155356650c2SBarry Smith PetscInt n=a->mbs; 4156356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 4157dfbe8321SBarry Smith PetscErrorCode ierr; 4158356650c2SBarry Smith const PetscInt *diag = a->diag; 4159d9fead3dSBarry Smith const MatScalar *aa=a->a; 4160d9fead3dSBarry Smith PetscScalar *x; 4161d9fead3dSBarry Smith const PetscScalar *b; 41624e2b4712SSatish Balay 41634e2b4712SSatish Balay PetscFunctionBegin; 4164*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 41651ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41664e2b4712SSatish Balay 4167aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 41682853dc0eSBarry Smith { 416987828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41702853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 41712853dc0eSBarry Smith } 4172aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 41732853dc0eSBarry Smith { 417487828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41752853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 41762853dc0eSBarry Smith } 4177aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 41782853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4179e1293385SBarry Smith #else 418030d4dcafSBarry Smith { 418187828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4182d9fead3dSBarry Smith const MatScalar *v; 4183356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 4184356650c2SBarry Smith const PetscInt *vi; 4185e1293385SBarry Smith 41864e2b4712SSatish Balay /* forward solve the lower 
triangular */ 41874e2b4712SSatish Balay idx = 0; 4188e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 41894e2b4712SSatish Balay for (i=1; i<n; i++) { 41904e2b4712SSatish Balay v = aa + 16*ai[i]; 41914e2b4712SSatish Balay vi = aj + ai[i]; 41924e2b4712SSatish Balay nz = diag[i] - ai[i]; 4193e1293385SBarry Smith idx += 4; 4194f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 41954e2b4712SSatish Balay while (nz--) { 41964e2b4712SSatish Balay jdx = 4*(*vi++); 41974e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4198f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4199f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4200f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4201f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 42024e2b4712SSatish Balay v += 16; 42034e2b4712SSatish Balay } 4204f1af5d2fSBarry Smith x[idx] = s1; 4205f1af5d2fSBarry Smith x[1+idx] = s2; 4206f1af5d2fSBarry Smith x[2+idx] = s3; 4207f1af5d2fSBarry Smith x[3+idx] = s4; 42084e2b4712SSatish Balay } 42094e2b4712SSatish Balay /* backward solve the upper triangular */ 42104e555682SBarry Smith idt = 4*(n-1); 42114e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 42124e555682SBarry Smith ai16 = 16*diag[i]; 42134e555682SBarry Smith v = aa + ai16 + 16; 42144e2b4712SSatish Balay vi = aj + diag[i] + 1; 42154e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4216f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4217f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 42184e2b4712SSatish Balay while (nz--) { 42194e2b4712SSatish Balay idx = 4*(*vi++); 42204e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4221f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4222f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4223f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4224f1af5d2fSBarry Smith s4 -= 
v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 42254e2b4712SSatish Balay v += 16; 42264e2b4712SSatish Balay } 42274e555682SBarry Smith v = aa + ai16; 4228f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4229f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4230f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4231f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4232329f5518SBarry Smith idt -= 4; 42334e2b4712SSatish Balay } 423430d4dcafSBarry Smith } 4235e1293385SBarry Smith #endif 42364e2b4712SSatish Balay 4237*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 42381ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4239dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 42404e2b4712SSatish Balay PetscFunctionReturn(0); 42414e2b4712SSatish Balay } 42424e2b4712SSatish Balay 4243b2b2dd24SShri Abhyankar #undef __FUNCT__ 42444dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 42454dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4246b2b2dd24SShri Abhyankar { 4247b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4248b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4249b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4250b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4251b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4252b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4253b2b2dd24SShri Abhyankar PetscScalar *x; 4254b2b2dd24SShri Abhyankar const PetscScalar *b; 4255b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4256cee9d6f2SShri Abhyankar 4257b2b2dd24SShri Abhyankar PetscFunctionBegin; 4258*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4259b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4260b2b2dd24SShri Abhyankar /* 
forward solve the lower triangular */ 4261b2b2dd24SShri Abhyankar idx = 0; 4262b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4263b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4264b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4265b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4266b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4267b2b2dd24SShri Abhyankar idx = bs*i; 4268b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4269b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 4270b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4271b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4272b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4273b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4274b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4275b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4276b2b2dd24SShri Abhyankar 4277b2b2dd24SShri Abhyankar v += bs2; 4278b2b2dd24SShri Abhyankar } 4279b2b2dd24SShri Abhyankar 4280b2b2dd24SShri Abhyankar x[idx] = s1; 4281b2b2dd24SShri Abhyankar x[1+idx] = s2; 4282b2b2dd24SShri Abhyankar x[2+idx] = s3; 4283b2b2dd24SShri Abhyankar x[3+idx] = s4; 4284b2b2dd24SShri Abhyankar } 4285b2b2dd24SShri Abhyankar 4286b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4287b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4288b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4289b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4290b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4291b2b2dd24SShri Abhyankar idt = bs*i; 4292b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4293b2b2dd24SShri Abhyankar 4294b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4295b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4296b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4297b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 
4298b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4299b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4300b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4301b2b2dd24SShri Abhyankar 4302b2b2dd24SShri Abhyankar v += bs2; 4303b2b2dd24SShri Abhyankar } 4304b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4305b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4306b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4307b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4308b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4309b2b2dd24SShri Abhyankar 4310b2b2dd24SShri Abhyankar } 4311b2b2dd24SShri Abhyankar 4312*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4313b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4314b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4315b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4316b2b2dd24SShri Abhyankar } 4317cee9d6f2SShri Abhyankar 4318cee9d6f2SShri Abhyankar #undef __FUNCT__ 4319f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4320dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4321f26ec98cSKris Buschelman { 4322f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4323b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4324dfbe8321SBarry Smith PetscErrorCode ierr; 4325b3260449SShri Abhyankar const MatScalar *aa=a->a; 4326b3260449SShri Abhyankar const PetscScalar *b; 4327b3260449SShri Abhyankar PetscScalar *x; 4328f26ec98cSKris Buschelman 4329f26ec98cSKris Buschelman PetscFunctionBegin; 4330*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 43311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4332f26ec98cSKris Buschelman 
4333f26ec98cSKris Buschelman { 4334f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4335b3260449SShri Abhyankar const MatScalar *v; 4336b3260449SShri Abhyankar MatScalar *t=(MatScalar *)x; 4337b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i,ai16; 4338b3260449SShri Abhyankar const PetscInt *vi; 4339f26ec98cSKris Buschelman 4340f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4341f26ec98cSKris Buschelman idx = 0; 4342f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4343f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4344f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4345f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4346f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4347f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4348f26ec98cSKris Buschelman vi = aj + ai[i]; 4349f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4350f26ec98cSKris Buschelman idx += 4; 4351f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4352f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4353f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4354f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4355f26ec98cSKris Buschelman while (nz--) { 4356f26ec98cSKris Buschelman jdx = 4*(*vi++); 4357f26ec98cSKris Buschelman x1 = t[jdx]; 4358f26ec98cSKris Buschelman x2 = t[1+jdx]; 4359f26ec98cSKris Buschelman x3 = t[2+jdx]; 4360f26ec98cSKris Buschelman x4 = t[3+jdx]; 4361f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4362f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4363f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4364f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4365f26ec98cSKris Buschelman v += 16; 4366f26ec98cSKris Buschelman } 4367f26ec98cSKris Buschelman t[idx] = s1; 4368f26ec98cSKris Buschelman t[1+idx] = s2; 4369f26ec98cSKris Buschelman t[2+idx] = s3; 4370f26ec98cSKris Buschelman t[3+idx] = s4; 4371f26ec98cSKris Buschelman } 4372f26ec98cSKris Buschelman /* backward solve 
the upper triangular */ 4373f26ec98cSKris Buschelman idt = 4*(n-1); 4374f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 4375f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4376f26ec98cSKris Buschelman v = aa + ai16 + 16; 4377f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4378f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4379f26ec98cSKris Buschelman s1 = t[idt]; 4380f26ec98cSKris Buschelman s2 = t[1+idt]; 4381f26ec98cSKris Buschelman s3 = t[2+idt]; 4382f26ec98cSKris Buschelman s4 = t[3+idt]; 4383f26ec98cSKris Buschelman while (nz--) { 4384f26ec98cSKris Buschelman idx = 4*(*vi++); 4385f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4386f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4387f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4388f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4389f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4390f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4391f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4392f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4393f26ec98cSKris Buschelman v += 16; 4394f26ec98cSKris Buschelman } 4395f26ec98cSKris Buschelman v = aa + ai16; 4396f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4397f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4398f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4399f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4400f26ec98cSKris Buschelman idt -= 4; 4401f26ec98cSKris Buschelman } 4402f26ec98cSKris Buschelman } 4403f26ec98cSKris Buschelman 4404*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 44051ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4406dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4407f26ec98cSKris 
Buschelman PetscFunctionReturn(0); 4408f26ec98cSKris Buschelman } 4409f26ec98cSKris Buschelman 44103660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 44113660e330SKris Buschelman 44123660e330SKris Buschelman #include PETSC_HAVE_SSE 44133660e330SKris Buschelman #undef __FUNCT__ 44147cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4415dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 44163660e330SKris Buschelman { 44173660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 44182aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4419dfbe8321SBarry Smith PetscErrorCode ierr; 4420dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 44213660e330SKris Buschelman MatScalar *aa=a->a; 442287828ca2SBarry Smith PetscScalar *x,*b; 44233660e330SKris Buschelman 44243660e330SKris Buschelman PetscFunctionBegin; 44253660e330SKris Buschelman SSE_SCOPE_BEGIN; 44263660e330SKris Buschelman /* 44273660e330SKris Buschelman Note: This code currently uses demotion of double 44283660e330SKris Buschelman to float when performing the mixed-mode computation. 44293660e330SKris Buschelman This may not be numerically reasonable for all applications. 44303660e330SKris Buschelman */ 44313660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 44323660e330SKris Buschelman 44331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 44341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 44353660e330SKris Buschelman { 4436eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4437eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 44382aa5897fSKris Buschelman int nz,i,idt,ai16; 44392aa5897fSKris Buschelman unsigned int jdx,idx; 44402aa5897fSKris Buschelman unsigned short *vi; 4441eb05f457SKris Buschelman /* Forward solve the lower triangular factor. 
*/ 44423660e330SKris Buschelman 4443eb05f457SKris Buschelman /* First block is the identity. */ 44443660e330SKris Buschelman idx = 0; 4445eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 44462aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 44473660e330SKris Buschelman 44483660e330SKris Buschelman for (i=1; i<n;) { 44493660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44503660e330SKris Buschelman vi = aj + ai[i]; 44513660e330SKris Buschelman nz = diag[i] - ai[i]; 44523660e330SKris Buschelman idx += 4; 44533660e330SKris Buschelman 4454eb05f457SKris Buschelman /* Demote RHS from double to float. */ 4455eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4456eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 44573660e330SKris Buschelman 44583660e330SKris Buschelman while (nz--) { 44593660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44602aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 44613660e330SKris Buschelman 44623660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4463eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 44643660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44653660e330SKris Buschelman 44663660e330SKris Buschelman /* First Column */ 44673660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44683660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44693660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44703660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44713660e330SKris Buschelman 44723660e330SKris Buschelman /* Second Column */ 44733660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44743660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44753660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44763660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44773660e330SKris Buschelman 44783660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44793660e330SKris Buschelman 44803660e330SKris Buschelman /* Third Column */ 44813660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 
44823660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44833660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44843660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44853660e330SKris Buschelman 44863660e330SKris Buschelman /* Fourth Column */ 44873660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44883660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44893660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44903660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44913660e330SKris Buschelman SSE_INLINE_END_2 44923660e330SKris Buschelman 44933660e330SKris Buschelman v += 16; 44943660e330SKris Buschelman } 44953660e330SKris Buschelman v = aa + 16*ai[++i]; 44963660e330SKris Buschelman PREFETCH_NTA(v); 4497eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 44983660e330SKris Buschelman } 4499eb05f457SKris Buschelman 4500eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4501eb05f457SKris Buschelman 45023660e330SKris Buschelman idt = 4*(n-1); 45033660e330SKris Buschelman ai16 = 16*diag[n-1]; 45043660e330SKris Buschelman v = aa + ai16 + 16; 45053660e330SKris Buschelman for (i=n-1; i>=0;){ 45063660e330SKris Buschelman PREFETCH_NTA(&v[8]); 45073660e330SKris Buschelman vi = aj + diag[i] + 1; 45083660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 45093660e330SKris Buschelman 4510eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 45113660e330SKris Buschelman 45123660e330SKris Buschelman while (nz--) { 45133660e330SKris Buschelman PREFETCH_NTA(&v[16]); 45142aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 45153660e330SKris Buschelman 45163660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4517eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 45183660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 45193660e330SKris Buschelman 45203660e330SKris Buschelman /* First Column */ 45213660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 45223660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 
45233660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 45243660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 45253660e330SKris Buschelman 45263660e330SKris Buschelman /* Second Column */ 45273660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 45283660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45293660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 45303660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 45313660e330SKris Buschelman 45323660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 45333660e330SKris Buschelman 45343660e330SKris Buschelman /* Third Column */ 45353660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 45363660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45373660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 45383660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 45393660e330SKris Buschelman 45403660e330SKris Buschelman /* Fourth Column */ 45413660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 45423660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45433660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45443660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45453660e330SKris Buschelman SSE_INLINE_END_2 45463660e330SKris Buschelman v += 16; 45473660e330SKris Buschelman } 45483660e330SKris Buschelman v = aa + ai16; 45493660e330SKris Buschelman ai16 = 16*diag[--i]; 45503660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 45513660e330SKris Buschelman /* 45523660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 45533660e330SKris Buschelman which was inverted as part of the factorization 45543660e330SKris Buschelman */ 4555eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 45563660e330SKris Buschelman /* First Column */ 45573660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 45583660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45593660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 45603660e330SKris Buschelman 45613660e330SKris Buschelman /* Second Column */ 45623660e330SKris 
Buschelman SSE_COPY_PS(XMM1,XMM7) 45633660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45643660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 45653660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 45663660e330SKris Buschelman 45673660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 45683660e330SKris Buschelman 45693660e330SKris Buschelman /* Third Column */ 45703660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 45713660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45723660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 45733660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 45743660e330SKris Buschelman 45753660e330SKris Buschelman /* Fourth Column */ 45763660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 45773660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45783660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 45793660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 45803660e330SKris Buschelman 45813660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 45823660e330SKris Buschelman SSE_INLINE_END_3 45833660e330SKris Buschelman 45843660e330SKris Buschelman v = aa + ai16 + 16; 45853660e330SKris Buschelman idt -= 4; 45863660e330SKris Buschelman } 4587eb05f457SKris Buschelman 4588eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4589eb05f457SKris Buschelman idt = 4*(n-1); 4590eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4591eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4592eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. 
*/ 4593eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4594eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4595eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4596eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4597eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4598eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 459954693613SKris Buschelman idt -= 4; 46003660e330SKris Buschelman } 4601eb05f457SKris Buschelman 4602eb05f457SKris Buschelman } /* End of artificial scope. */ 46031ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 46041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4605dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 46063660e330SKris Buschelman SSE_SCOPE_END; 46073660e330SKris Buschelman PetscFunctionReturn(0); 46083660e330SKris Buschelman } 46093660e330SKris Buschelman 46107cf1b8d3SKris Buschelman #undef __FUNCT__ 46117cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4612dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 46137cf1b8d3SKris Buschelman { 46147cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 46157cf1b8d3SKris Buschelman int *aj=a->j; 4616dfbe8321SBarry Smith PetscErrorCode ierr; 4617dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 46187cf1b8d3SKris Buschelman MatScalar *aa=a->a; 46197cf1b8d3SKris Buschelman PetscScalar *x,*b; 46207cf1b8d3SKris Buschelman 46217cf1b8d3SKris Buschelman PetscFunctionBegin; 46227cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 46237cf1b8d3SKris Buschelman /* 46247cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 46257cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 46267cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
46277cf1b8d3SKris Buschelman */ 46287cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 46297cf1b8d3SKris Buschelman 46301ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 46311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46327cf1b8d3SKris Buschelman { 46337cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 46347cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 46357cf1b8d3SKris Buschelman int nz,i,idt,ai16; 46367cf1b8d3SKris Buschelman int jdx,idx; 46377cf1b8d3SKris Buschelman int *vi; 46387cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 46397cf1b8d3SKris Buschelman 46407cf1b8d3SKris Buschelman /* First block is the identity. */ 46417cf1b8d3SKris Buschelman idx = 0; 46427cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 46437cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 46447cf1b8d3SKris Buschelman 46457cf1b8d3SKris Buschelman for (i=1; i<n;) { 46467cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 46477cf1b8d3SKris Buschelman vi = aj + ai[i]; 46487cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 46497cf1b8d3SKris Buschelman idx += 4; 46507cf1b8d3SKris Buschelman 46517cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 46527cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 46537cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 46547cf1b8d3SKris Buschelman 46557cf1b8d3SKris Buschelman while (nz--) { 46567cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 46577cf1b8d3SKris Buschelman jdx = 4*(*vi++); 46587cf1b8d3SKris Buschelman /* jdx = *vi++; */ 46597cf1b8d3SKris Buschelman 46607cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 46617cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 46627cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 46637cf1b8d3SKris Buschelman 46647cf1b8d3SKris Buschelman /* First Column */ 46657cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 46667cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46677cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 46687cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 46697cf1b8d3SKris Buschelman 46707cf1b8d3SKris Buschelman /* Second Column */ 46717cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 46727cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46737cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 46747cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 46757cf1b8d3SKris Buschelman 46767cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 46777cf1b8d3SKris Buschelman 46787cf1b8d3SKris Buschelman /* Third Column */ 46797cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 46807cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46817cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 46827cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 46837cf1b8d3SKris Buschelman 46847cf1b8d3SKris Buschelman /* Fourth Column */ 46857cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 46867cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46877cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 46887cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 46897cf1b8d3SKris Buschelman SSE_INLINE_END_2 46907cf1b8d3SKris Buschelman 46917cf1b8d3SKris 
Buschelman v += 16; 46927cf1b8d3SKris Buschelman } 46937cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 46947cf1b8d3SKris Buschelman PREFETCH_NTA(v); 46957cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 46967cf1b8d3SKris Buschelman } 46977cf1b8d3SKris Buschelman 46987cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 46997cf1b8d3SKris Buschelman 47007cf1b8d3SKris Buschelman idt = 4*(n-1); 47017cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 47027cf1b8d3SKris Buschelman v = aa + ai16 + 16; 47037cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 47047cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 47057cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 47067cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 47077cf1b8d3SKris Buschelman 47087cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 47097cf1b8d3SKris Buschelman 47107cf1b8d3SKris Buschelman while (nz--) { 47117cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 47127cf1b8d3SKris Buschelman idx = 4*(*vi++); 47137cf1b8d3SKris Buschelman /* idx = *vi++; */ 47147cf1b8d3SKris Buschelman 47157cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 47167cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 47177cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 47187cf1b8d3SKris Buschelman 47197cf1b8d3SKris Buschelman /* First Column */ 47207cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 47217cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 47227cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 47237cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 47247cf1b8d3SKris Buschelman 47257cf1b8d3SKris Buschelman /* Second Column */ 47267cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 47277cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 47287cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 47297cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 47307cf1b8d3SKris Buschelman 47317cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 47327cf1b8d3SKris Buschelman 
47337cf1b8d3SKris Buschelman /* Third Column */ 47347cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 47357cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 47367cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 47377cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 47387cf1b8d3SKris Buschelman 47397cf1b8d3SKris Buschelman /* Fourth Column */ 47407cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 47417cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 47427cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 47437cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 47447cf1b8d3SKris Buschelman SSE_INLINE_END_2 47457cf1b8d3SKris Buschelman v += 16; 47467cf1b8d3SKris Buschelman } 47477cf1b8d3SKris Buschelman v = aa + ai16; 47487cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 47497cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 47507cf1b8d3SKris Buschelman /* 47517cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 47527cf1b8d3SKris Buschelman which was inverted as part of the factorization 47537cf1b8d3SKris Buschelman */ 47547cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 47557cf1b8d3SKris Buschelman /* First Column */ 47567cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 47577cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 47587cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 47597cf1b8d3SKris Buschelman 47607cf1b8d3SKris Buschelman /* Second Column */ 47617cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 47627cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 47637cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 47647cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 47657cf1b8d3SKris Buschelman 47667cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 47677cf1b8d3SKris Buschelman 47687cf1b8d3SKris Buschelman /* Third Column */ 47697cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 47707cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 47717cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 47727cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 47737cf1b8d3SKris Buschelman 47747cf1b8d3SKris Buschelman /* Fourth Column */ 47757cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 47767cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 47777cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 47787cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 47797cf1b8d3SKris Buschelman 47807cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 47817cf1b8d3SKris Buschelman SSE_INLINE_END_3 47827cf1b8d3SKris Buschelman 47837cf1b8d3SKris Buschelman v = aa + ai16 + 16; 47847cf1b8d3SKris Buschelman idt -= 4; 47857cf1b8d3SKris Buschelman } 47867cf1b8d3SKris Buschelman 47877cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 47887cf1b8d3SKris Buschelman idt = 4*(n-1); 47897cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 47907cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 47917cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 47927cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 47937cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 47947cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 47957cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 47967cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 47977cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 47987cf1b8d3SKris Buschelman idt -= 4; 47997cf1b8d3SKris Buschelman } 48007cf1b8d3SKris Buschelman 48017cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 48021ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 48031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4804dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 48057cf1b8d3SKris Buschelman SSE_SCOPE_END; 48067cf1b8d3SKris Buschelman PetscFunctionReturn(0); 48077cf1b8d3SKris Buschelman } 48087cf1b8d3SKris Buschelman 48093660e330SKris Buschelman #endif 48108f690400SShri Abhyankar 48114a2ae208SSatish Balay #undef __FUNCT__ 481206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 481306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 48144e2b4712SSatish Balay { 48154e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48164e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 48176849ba73SBarry Smith PetscErrorCode ierr; 4818b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4819b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 48205d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4821d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4822d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4823d9fead3dSBarry Smith const PetscScalar *b; 48244e2b4712SSatish Balay 48254e2b4712SSatish Balay PetscFunctionBegin; 4826*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 48271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4828f1af5d2fSBarry Smith t = a->solve_work; 48294e2b4712SSatish Balay 48304e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48314e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 48324e2b4712SSatish Balay 48334e2b4712SSatish Balay /* forward solve the lower triangular */ 48344e2b4712SSatish Balay idx = 3*(*r++); 4835f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 48364e2b4712SSatish Balay for (i=1; i<n; i++) { 48374e2b4712SSatish Balay v = aa + 9*ai[i]; 48384e2b4712SSatish 
Balay vi = aj + ai[i]; 48394e2b4712SSatish Balay nz = diag[i] - ai[i]; 48404e2b4712SSatish Balay idx = 3*(*r++); 4841f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 48424e2b4712SSatish Balay while (nz--) { 48434e2b4712SSatish Balay idx = 3*(*vi++); 4844f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4845f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4846f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4847f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48484e2b4712SSatish Balay v += 9; 48494e2b4712SSatish Balay } 48504e2b4712SSatish Balay idx = 3*i; 4851f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 48524e2b4712SSatish Balay } 48534e2b4712SSatish Balay /* backward solve the upper triangular */ 48544e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 48554e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 48564e2b4712SSatish Balay vi = aj + diag[i] + 1; 48574e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 48584e2b4712SSatish Balay idt = 3*i; 4859f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 48604e2b4712SSatish Balay while (nz--) { 48614e2b4712SSatish Balay idx = 3*(*vi++); 4862f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4863f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4864f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4865f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48664e2b4712SSatish Balay v += 9; 48674e2b4712SSatish Balay } 48684e2b4712SSatish Balay idc = 3*(*c--); 48694e2b4712SSatish Balay v = aa + 9*diag[i]; 4870f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4871f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4872f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 48734e2b4712SSatish Balay } 48744e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48754e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4876*3649974fSBarry Smith 
ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 48771ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4878dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 48794e2b4712SSatish Balay PetscFunctionReturn(0); 48804e2b4712SSatish Balay } 48814e2b4712SSatish Balay 48820c4413a7SShri Abhyankar #undef __FUNCT__ 48834dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3" 48844dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 48850c4413a7SShri Abhyankar { 48860c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48870c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 48880c4413a7SShri Abhyankar PetscErrorCode ierr; 4889b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4890b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 48910c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 48920c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 48930c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 48940c4413a7SShri Abhyankar const PetscScalar *b; 48950c4413a7SShri Abhyankar 48960c4413a7SShri Abhyankar PetscFunctionBegin; 4897*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 48980c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 48990c4413a7SShri Abhyankar t = a->solve_work; 49000c4413a7SShri Abhyankar 49010c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 49020c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 49030c4413a7SShri Abhyankar 49040c4413a7SShri Abhyankar /* forward solve the lower triangular */ 49050c4413a7SShri Abhyankar idx = 3*r[0]; 49060c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 49070c4413a7SShri Abhyankar for (i=1; i<n; i++) { 49080c4413a7SShri Abhyankar v = aa + 9*ai[i]; 49090c4413a7SShri Abhyankar vi = aj + ai[i]; 49100c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 49110c4413a7SShri Abhyankar idx = 
3*r[i]; 49120c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 49130c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49140c4413a7SShri Abhyankar idx = 3*vi[m]; 49150c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 49160c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 49170c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 49180c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 49190c4413a7SShri Abhyankar v += 9; 49200c4413a7SShri Abhyankar } 49210c4413a7SShri Abhyankar idx = 3*i; 49220c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 49230c4413a7SShri Abhyankar } 49240c4413a7SShri Abhyankar /* backward solve the upper triangular */ 49250c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 49260c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 49270c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 49280c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 49290c4413a7SShri Abhyankar idt = 3*i; 49300c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 49310c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49320c4413a7SShri Abhyankar idx = 3*vi[m]; 49330c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 49340c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 49350c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 49360c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 49370c4413a7SShri Abhyankar v += 9; 49380c4413a7SShri Abhyankar } 49390c4413a7SShri Abhyankar idc = 3*c[i]; 49400c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 49410c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 49420c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 49430c4413a7SShri Abhyankar } 49440c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49450c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4946*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 
49470c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 49480c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 49490c4413a7SShri Abhyankar PetscFunctionReturn(0); 49500c4413a7SShri Abhyankar } 49510c4413a7SShri Abhyankar 495215091d37SBarry Smith /* 495315091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 495415091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 495515091d37SBarry Smith */ 49564a2ae208SSatish Balay #undef __FUNCT__ 495706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 495806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 495915091d37SBarry Smith { 496015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 49610b68f018SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4962dfbe8321SBarry Smith PetscErrorCode ierr; 49630b68f018SBarry Smith const PetscInt *diag = a->diag,*vi; 4964d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4965d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4966d9fead3dSBarry Smith const PetscScalar *b; 49670b68f018SBarry Smith PetscInt jdx,idt,idx,nz,i; 496815091d37SBarry Smith 496915091d37SBarry Smith PetscFunctionBegin; 4970*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 49711ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 497215091d37SBarry Smith 497315091d37SBarry Smith /* forward solve the lower triangular */ 497415091d37SBarry Smith idx = 0; 497515091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 497615091d37SBarry Smith for (i=1; i<n; i++) { 497715091d37SBarry Smith v = aa + 9*ai[i]; 497815091d37SBarry Smith vi = aj + ai[i]; 497915091d37SBarry Smith nz = diag[i] - ai[i]; 498015091d37SBarry Smith idx += 3; 4981f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 498215091d37SBarry Smith while (nz--) { 498315091d37SBarry Smith jdx = 3*(*vi++); 
498415091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4985f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4986f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4987f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 498815091d37SBarry Smith v += 9; 498915091d37SBarry Smith } 4990f1af5d2fSBarry Smith x[idx] = s1; 4991f1af5d2fSBarry Smith x[1+idx] = s2; 4992f1af5d2fSBarry Smith x[2+idx] = s3; 499315091d37SBarry Smith } 499415091d37SBarry Smith /* backward solve the upper triangular */ 499515091d37SBarry Smith for (i=n-1; i>=0; i--){ 499615091d37SBarry Smith v = aa + 9*diag[i] + 9; 499715091d37SBarry Smith vi = aj + diag[i] + 1; 499815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 499915091d37SBarry Smith idt = 3*i; 5000f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 5001f1af5d2fSBarry Smith s3 = x[2+idt]; 500215091d37SBarry Smith while (nz--) { 500315091d37SBarry Smith idx = 3*(*vi++); 500415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 5005f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5006f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5007f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 500815091d37SBarry Smith v += 9; 500915091d37SBarry Smith } 501015091d37SBarry Smith v = aa + 9*diag[i]; 5011f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5012f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5013f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 501415091d37SBarry Smith } 501515091d37SBarry Smith 5016*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 50171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5018dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 501915091d37SBarry Smith PetscFunctionReturn(0); 502015091d37SBarry Smith } 502115091d37SBarry Smith 5022cee9d6f2SShri Abhyankar #undef __FUNCT__ 50234dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 
50244dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 5025b2b2dd24SShri Abhyankar { 5026b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5027b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5028b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5029b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 5030b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 5031b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5032b2b2dd24SShri Abhyankar PetscScalar *x; 5033b2b2dd24SShri Abhyankar const PetscScalar *b; 5034b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 5035b2b2dd24SShri Abhyankar 5036b2b2dd24SShri Abhyankar PetscFunctionBegin; 5037*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5038b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5039b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5040b2b2dd24SShri Abhyankar idx = 0; 5041b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5042b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5043b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 5044b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5045b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5046b2b2dd24SShri Abhyankar idx = bs*i; 5047b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5048b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5049b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 5050b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5051b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5052b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5053b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5054b2b2dd24SShri Abhyankar 5055b2b2dd24SShri Abhyankar v += bs2; 5056b2b2dd24SShri Abhyankar } 5057b2b2dd24SShri Abhyankar 5058b2b2dd24SShri Abhyankar x[idx] = s1; 5059b2b2dd24SShri Abhyankar x[1+idx] = s2; 5060b2b2dd24SShri Abhyankar x[2+idx] = s3; 5061b2b2dd24SShri 
Abhyankar } 5062b2b2dd24SShri Abhyankar 5063b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5064b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 5065b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 5066b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5067b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5068b2b2dd24SShri Abhyankar idt = bs*i; 5069b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5070b2b2dd24SShri Abhyankar 5071b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5072b2b2dd24SShri Abhyankar idx = bs*vi[k]; 5073b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5074b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5075b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5076b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5077b2b2dd24SShri Abhyankar 5078b2b2dd24SShri Abhyankar v += bs2; 5079b2b2dd24SShri Abhyankar } 5080b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5081b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5082b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5083b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5084b2b2dd24SShri Abhyankar 5085b2b2dd24SShri Abhyankar } 5086b2b2dd24SShri Abhyankar 5087*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5088b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5089b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5090b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5091b2b2dd24SShri Abhyankar } 5092b2b2dd24SShri Abhyankar 5093b2b2dd24SShri Abhyankar #undef __FUNCT__ 509406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 509506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 50964e2b4712SSatish Balay { 50974e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 50984e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 50996849ba73SBarry Smith 
PetscErrorCode ierr; 5100b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5101b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 51025d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5103d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5104d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 5105d9fead3dSBarry Smith const PetscScalar *b; 51064e2b4712SSatish Balay 51074e2b4712SSatish Balay PetscFunctionBegin; 5108*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51091ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5110f1af5d2fSBarry Smith t = a->solve_work; 51114e2b4712SSatish Balay 51124e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51134e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 51144e2b4712SSatish Balay 51154e2b4712SSatish Balay /* forward solve the lower triangular */ 51164e2b4712SSatish Balay idx = 2*(*r++); 5117f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 51184e2b4712SSatish Balay for (i=1; i<n; i++) { 51194e2b4712SSatish Balay v = aa + 4*ai[i]; 51204e2b4712SSatish Balay vi = aj + ai[i]; 51214e2b4712SSatish Balay nz = diag[i] - ai[i]; 51224e2b4712SSatish Balay idx = 2*(*r++); 5123f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 51244e2b4712SSatish Balay while (nz--) { 51254e2b4712SSatish Balay idx = 2*(*vi++); 5126f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5127f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5128f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51294e2b4712SSatish Balay v += 4; 51304e2b4712SSatish Balay } 51314e2b4712SSatish Balay idx = 2*i; 5132f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 51334e2b4712SSatish Balay } 51344e2b4712SSatish Balay /* backward solve the upper triangular */ 51354e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 51364e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 51374e2b4712SSatish Balay vi = aj + diag[i] + 1; 51384e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 
51394e2b4712SSatish Balay idt = 2*i; 5140f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 51414e2b4712SSatish Balay while (nz--) { 51424e2b4712SSatish Balay idx = 2*(*vi++); 5143f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5144f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5145f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51464e2b4712SSatish Balay v += 4; 51474e2b4712SSatish Balay } 51484e2b4712SSatish Balay idc = 2*(*c--); 51494e2b4712SSatish Balay v = aa + 4*diag[i]; 5150f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5151f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51524e2b4712SSatish Balay } 51534e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51544e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5155*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51561ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5157dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51584e2b4712SSatish Balay PetscFunctionReturn(0); 51594e2b4712SSatish Balay } 51604e2b4712SSatish Balay 51610c4413a7SShri Abhyankar #undef __FUNCT__ 51624dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 51634dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 51640c4413a7SShri Abhyankar { 51650c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 51660c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 51670c4413a7SShri Abhyankar PetscErrorCode ierr; 5168b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5169b3260449SShri Abhyankar PetscInt i,nz,idx,jdx,idt,idc,m; 51700c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 51710c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 51720c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 51730c4413a7SShri Abhyankar const PetscScalar *b; 51740c4413a7SShri Abhyankar 51750c4413a7SShri Abhyankar PetscFunctionBegin; 
5176*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51770c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 51780c4413a7SShri Abhyankar t = a->solve_work; 51790c4413a7SShri Abhyankar 51800c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51810c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 51820c4413a7SShri Abhyankar 51830c4413a7SShri Abhyankar /* forward solve the lower triangular */ 51840c4413a7SShri Abhyankar idx = 2*r[0]; 51850c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 51860c4413a7SShri Abhyankar for (i=1; i<n; i++) { 51870c4413a7SShri Abhyankar v = aa + 4*ai[i]; 51880c4413a7SShri Abhyankar vi = aj + ai[i]; 51890c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 51900c4413a7SShri Abhyankar idx = 2*r[i]; 51910c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 51920c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 51930c4413a7SShri Abhyankar jdx = 2*vi[m]; 51940c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 51950c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51960c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51970c4413a7SShri Abhyankar v += 4; 51980c4413a7SShri Abhyankar } 51990c4413a7SShri Abhyankar idx = 2*i; 52000c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 52010c4413a7SShri Abhyankar } 52020c4413a7SShri Abhyankar /* backward solve the upper triangular */ 52030c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 52040c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 52050c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 52060c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 52070c4413a7SShri Abhyankar idt = 2*i; 52080c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 52090c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 52100c4413a7SShri Abhyankar idx = 2*vi[m]; 52110c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 52120c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 52130c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 52140c4413a7SShri Abhyankar v += 4; 
52150c4413a7SShri Abhyankar } 52160c4413a7SShri Abhyankar idc = 2*c[i]; 52170c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 52180c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 52190c4413a7SShri Abhyankar } 52200c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 52210c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5222*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52230c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52240c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 52250c4413a7SShri Abhyankar PetscFunctionReturn(0); 52260c4413a7SShri Abhyankar } 52278f690400SShri Abhyankar 522815091d37SBarry Smith /* 522915091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 523015091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 523115091d37SBarry Smith */ 52324a2ae208SSatish Balay #undef __FUNCT__ 523306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 523406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 523515091d37SBarry Smith { 523615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5237b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5238dfbe8321SBarry Smith PetscErrorCode ierr; 5239d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5240d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5241d9fead3dSBarry Smith const PetscScalar *b; 5242b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 524315091d37SBarry Smith 524415091d37SBarry Smith PetscFunctionBegin; 5245*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 52461ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524715091d37SBarry Smith 524815091d37SBarry Smith /* forward solve the lower triangular */ 524915091d37SBarry Smith idx = 0; 
525015091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 525115091d37SBarry Smith for (i=1; i<n; i++) { 525215091d37SBarry Smith v = aa + 4*ai[i]; 525315091d37SBarry Smith vi = aj + ai[i]; 525415091d37SBarry Smith nz = diag[i] - ai[i]; 525515091d37SBarry Smith idx += 2; 5256f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 525715091d37SBarry Smith while (nz--) { 525815091d37SBarry Smith jdx = 2*(*vi++); 525915091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5260f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5261f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 526215091d37SBarry Smith v += 4; 526315091d37SBarry Smith } 5264f1af5d2fSBarry Smith x[idx] = s1; 5265f1af5d2fSBarry Smith x[1+idx] = s2; 526615091d37SBarry Smith } 526715091d37SBarry Smith /* backward solve the upper triangular */ 526815091d37SBarry Smith for (i=n-1; i>=0; i--){ 526915091d37SBarry Smith v = aa + 4*diag[i] + 4; 527015091d37SBarry Smith vi = aj + diag[i] + 1; 527115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 527215091d37SBarry Smith idt = 2*i; 5273f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 527415091d37SBarry Smith while (nz--) { 527515091d37SBarry Smith idx = 2*(*vi++); 527615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5277f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5278f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 527915091d37SBarry Smith v += 4; 528015091d37SBarry Smith } 528115091d37SBarry Smith v = aa + 4*diag[i]; 5282f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5283f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 528415091d37SBarry Smith } 528515091d37SBarry Smith 5286*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5288dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 528915091d37SBarry Smith PetscFunctionReturn(0); 529015091d37SBarry Smith } 529115091d37SBarry Smith 5292cee9d6f2SShri Abhyankar #undef __FUNCT__ 52934dd39f65SShri Abhyankar #define 
__FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 52944dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5295b2b2dd24SShri Abhyankar { 5296b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5297b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5298b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 5299b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5300b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5301b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5302b2b2dd24SShri Abhyankar const PetscScalar *b; 5303b2b2dd24SShri Abhyankar 5304b2b2dd24SShri Abhyankar PetscFunctionBegin; 5305*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5306b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5307b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5308b2b2dd24SShri Abhyankar idx = 0; 5309b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5310b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5311b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5312b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5313b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5314b2b2dd24SShri Abhyankar idx = 2*i; 5315b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 5316b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5317b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5318b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5319b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5320b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5321b2b2dd24SShri Abhyankar v += 4; 5322b2b2dd24SShri Abhyankar } 5323b2b2dd24SShri Abhyankar x[idx] = s1; 5324b2b2dd24SShri Abhyankar x[1+idx] = s2; 5325b2b2dd24SShri Abhyankar } 5326b2b2dd24SShri Abhyankar 5327b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5328b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 5329b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5330b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5331b2b2dd24SShri Abhyankar nz = 
adiag[i] - adiag[i+1]-1; 5332b2b2dd24SShri Abhyankar idt = 2*i; 5333b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 5334b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5335b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5336b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5337b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5338b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5339b2b2dd24SShri Abhyankar v += 4; 5340b2b2dd24SShri Abhyankar } 5341b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5342b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5343b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5344b2b2dd24SShri Abhyankar } 5345b2b2dd24SShri Abhyankar 5346*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5347b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5348b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5349b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5350b2b2dd24SShri Abhyankar } 5351b2b2dd24SShri Abhyankar 5352b2b2dd24SShri Abhyankar #undef __FUNCT__ 535306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 535406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 53554e2b4712SSatish Balay { 53564e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 53574e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 53586849ba73SBarry Smith PetscErrorCode ierr; 5359b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5360b3260449SShri Abhyankar PetscInt i,nz; 53615d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5362b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5363b3260449SShri Abhyankar PetscScalar *x,s1,*t; 5364b3260449SShri Abhyankar const PetscScalar *b; 53654e2b4712SSatish Balay 53664e2b4712SSatish Balay PetscFunctionBegin; 53674e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 53684e2b4712SSatish Balay 5369*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 
53701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5371f1af5d2fSBarry Smith t = a->solve_work; 53724e2b4712SSatish Balay 53734e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 53744e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 53754e2b4712SSatish Balay 53764e2b4712SSatish Balay /* forward solve the lower triangular */ 5377f1af5d2fSBarry Smith t[0] = b[*r++]; 53784e2b4712SSatish Balay for (i=1; i<n; i++) { 53794e2b4712SSatish Balay v = aa + ai[i]; 53804e2b4712SSatish Balay vi = aj + ai[i]; 53814e2b4712SSatish Balay nz = diag[i] - ai[i]; 5382f1af5d2fSBarry Smith s1 = b[*r++]; 53834e2b4712SSatish Balay while (nz--) { 5384f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53854e2b4712SSatish Balay } 5386f1af5d2fSBarry Smith t[i] = s1; 53874e2b4712SSatish Balay } 53884e2b4712SSatish Balay /* backward solve the upper triangular */ 53894e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 53904e2b4712SSatish Balay v = aa + diag[i] + 1; 53914e2b4712SSatish Balay vi = aj + diag[i] + 1; 53924e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5393f1af5d2fSBarry Smith s1 = t[i]; 53944e2b4712SSatish Balay while (nz--) { 5395f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53964e2b4712SSatish Balay } 5397f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 53984e2b4712SSatish Balay } 53994e2b4712SSatish Balay 54004e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 54014e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5402*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 54031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5404dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 54054e2b4712SSatish Balay PetscFunctionReturn(0); 54064e2b4712SSatish Balay } 5407048b5e81SShri Abhyankar 5408048b5e81SShri Abhyankar #undef __FUNCT__ 5409048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1" 
5410048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5411048b5e81SShri Abhyankar { 5412048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5413048b5e81SShri Abhyankar IS iscol = a->col,isrow = a->row; 5414048b5e81SShri Abhyankar PetscErrorCode ierr; 5415048b5e81SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5416048b5e81SShri Abhyankar const PetscInt *rout,*cout,*r,*c; 5417048b5e81SShri Abhyankar PetscScalar *x,*tmp,sum; 5418048b5e81SShri Abhyankar const PetscScalar *b; 5419048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5420048b5e81SShri Abhyankar 5421048b5e81SShri Abhyankar PetscFunctionBegin; 5422048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5423048b5e81SShri Abhyankar 5424*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5425048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5426048b5e81SShri Abhyankar tmp = a->solve_work; 5427048b5e81SShri Abhyankar 5428048b5e81SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5429048b5e81SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5430048b5e81SShri Abhyankar 5431048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5432048b5e81SShri Abhyankar tmp[0] = b[r[0]]; 5433048b5e81SShri Abhyankar v = aa; 5434048b5e81SShri Abhyankar vi = aj; 5435048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5436048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5437048b5e81SShri Abhyankar sum = b[r[i]]; 5438048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5439048b5e81SShri Abhyankar tmp[i] = sum; 5440048b5e81SShri Abhyankar v += nz; vi += nz; 5441048b5e81SShri Abhyankar } 5442048b5e81SShri Abhyankar 5443048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5444048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5445048b5e81SShri Abhyankar v = aa + adiag[i+1]+1; 5446048b5e81SShri Abhyankar vi = aj + adiag[i+1]+1; 5447048b5e81SShri Abhyankar nz = 
adiag[i]-adiag[i+1]-1; 5448048b5e81SShri Abhyankar sum = tmp[i]; 5449048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5450048b5e81SShri Abhyankar x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5451048b5e81SShri Abhyankar } 5452048b5e81SShri Abhyankar 5453048b5e81SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5454048b5e81SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5455*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5456048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5457048b5e81SShri Abhyankar ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5458048b5e81SShri Abhyankar PetscFunctionReturn(0); 5459048b5e81SShri Abhyankar } 5460048b5e81SShri Abhyankar 546115091d37SBarry Smith /* 546215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 546315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 546415091d37SBarry Smith */ 54654a2ae208SSatish Balay #undef __FUNCT__ 546606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 546706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 546815091d37SBarry Smith { 546915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5470b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5471dfbe8321SBarry Smith PetscErrorCode ierr; 5472b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5473b3260449SShri Abhyankar PetscScalar *x; 5474b3260449SShri Abhyankar const PetscScalar *b; 547587828ca2SBarry Smith PetscScalar s1,x1; 5476b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 547715091d37SBarry Smith 547815091d37SBarry Smith PetscFunctionBegin; 5479*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 54801ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 548115091d37SBarry Smith 548215091d37SBarry Smith /* 
forward solve the lower triangular */ 548315091d37SBarry Smith idx = 0; 548415091d37SBarry Smith x[0] = b[0]; 548515091d37SBarry Smith for (i=1; i<n; i++) { 548615091d37SBarry Smith v = aa + ai[i]; 548715091d37SBarry Smith vi = aj + ai[i]; 548815091d37SBarry Smith nz = diag[i] - ai[i]; 548915091d37SBarry Smith idx += 1; 5490f1af5d2fSBarry Smith s1 = b[idx]; 549115091d37SBarry Smith while (nz--) { 549215091d37SBarry Smith jdx = *vi++; 549315091d37SBarry Smith x1 = x[jdx]; 5494f1af5d2fSBarry Smith s1 -= v[0]*x1; 549515091d37SBarry Smith v += 1; 549615091d37SBarry Smith } 5497f1af5d2fSBarry Smith x[idx] = s1; 549815091d37SBarry Smith } 549915091d37SBarry Smith /* backward solve the upper triangular */ 550015091d37SBarry Smith for (i=n-1; i>=0; i--){ 550115091d37SBarry Smith v = aa + diag[i] + 1; 550215091d37SBarry Smith vi = aj + diag[i] + 1; 550315091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 550415091d37SBarry Smith idt = i; 5505f1af5d2fSBarry Smith s1 = x[idt]; 550615091d37SBarry Smith while (nz--) { 550715091d37SBarry Smith idx = *vi++; 550815091d37SBarry Smith x1 = x[idx]; 5509f1af5d2fSBarry Smith s1 -= v[0]*x1; 551015091d37SBarry Smith v += 1; 551115091d37SBarry Smith } 551215091d37SBarry Smith v = aa + diag[i]; 5513f1af5d2fSBarry Smith x[idt] = v[0]*s1; 551415091d37SBarry Smith } 5515*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 55161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5517dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 551815091d37SBarry Smith PetscFunctionReturn(0); 551915091d37SBarry Smith } 55204e2b4712SSatish Balay 5521048b5e81SShri Abhyankar 5522048b5e81SShri Abhyankar #undef __FUNCT__ 5523048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5524048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5525048b5e81SShri Abhyankar { 5526048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 
5527048b5e81SShri Abhyankar PetscErrorCode ierr; 5528048b5e81SShri Abhyankar const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5529048b5e81SShri Abhyankar PetscScalar *x,sum; 5530048b5e81SShri Abhyankar const PetscScalar *b; 5531048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5532048b5e81SShri Abhyankar PetscInt i,nz; 5533048b5e81SShri Abhyankar 5534048b5e81SShri Abhyankar PetscFunctionBegin; 5535048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5536048b5e81SShri Abhyankar 5537*3649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5538048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5539048b5e81SShri Abhyankar 5540048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5541048b5e81SShri Abhyankar x[0] = b[0]; 5542048b5e81SShri Abhyankar v = aa; 5543048b5e81SShri Abhyankar vi = aj; 5544048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5545048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5546048b5e81SShri Abhyankar sum = b[i]; 5547048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5548048b5e81SShri Abhyankar v += nz; 5549048b5e81SShri Abhyankar vi += nz; 5550048b5e81SShri Abhyankar x[i] = sum; 5551048b5e81SShri Abhyankar } 5552048b5e81SShri Abhyankar 5553048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5554048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5555048b5e81SShri Abhyankar v = aa + adiag[i+1] + 1; 5556048b5e81SShri Abhyankar vi = aj + adiag[i+1] + 1; 5557048b5e81SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5558048b5e81SShri Abhyankar sum = x[i]; 5559048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5560048b5e81SShri Abhyankar x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5561048b5e81SShri Abhyankar } 5562048b5e81SShri Abhyankar 5563048b5e81SShri Abhyankar ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 5564*3649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5565048b5e81SShri Abhyankar ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 5566048b5e81SShri Abhyankar PetscFunctionReturn(0); 5567048b5e81SShri Abhyankar } 5568048b5e81SShri Abhyankar 55694e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 557016a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 55716bce7ff8SHong Zhang 55722b0b2ea7SShri Abhyankar #undef __FUNCT__ 557329a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5574766f9fbaSBarry Smith /* 5575766f9fbaSBarry Smith This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5576766f9fbaSBarry Smith */ 557729a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 55782b0b2ea7SShri Abhyankar { 55792b0b2ea7SShri Abhyankar Mat C=B; 55802b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 55812b0b2ea7SShri Abhyankar PetscErrorCode ierr; 5582766f9fbaSBarry Smith PetscInt i,j,k,ipvt[15]; 5583766f9fbaSBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5584766f9fbaSBarry Smith PetscInt nz,nzL,row; 5585766f9fbaSBarry Smith MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5586766f9fbaSBarry Smith const MatScalar *v,*aa=a->a; 55872b0b2ea7SShri Abhyankar PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 55880fa040f9SShri Abhyankar PetscInt sol_ver; 55892b0b2ea7SShri Abhyankar 55902b0b2ea7SShri Abhyankar PetscFunctionBegin; 55912b0b2ea7SShri Abhyankar 55920fa040f9SShri Abhyankar ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 55930fa040f9SShri Abhyankar 55942b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 55952b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 55962b0b2ea7SShri Abhyankar ierr = 
PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 55972b0b2ea7SShri Abhyankar 55982b0b2ea7SShri Abhyankar for (i=0; i<n; i++){ 55992b0b2ea7SShri Abhyankar /* zero rtmp */ 56002b0b2ea7SShri Abhyankar /* L part */ 56012b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56022b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 56032b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 56042b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56052b0b2ea7SShri Abhyankar } 56062b0b2ea7SShri Abhyankar 56072b0b2ea7SShri Abhyankar /* U part */ 56082b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 56092b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 56102b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 56112b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56122b0b2ea7SShri Abhyankar } 56132b0b2ea7SShri Abhyankar 56142b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 561529a97285SShri Abhyankar nz = ai[i+1] - ai[i]; 561629a97285SShri Abhyankar ajtmp = aj + ai[i]; 561729a97285SShri Abhyankar v = aa + bs2*ai[i]; 56182b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 561929a97285SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 56202b0b2ea7SShri Abhyankar } 56212b0b2ea7SShri Abhyankar 56222b0b2ea7SShri Abhyankar /* elimination */ 56232b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 56242b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 56252b0b2ea7SShri Abhyankar for(k=0;k < nzL;k++) { 56262b0b2ea7SShri Abhyankar row = bjtmp[k]; 56272b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 56282b0b2ea7SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 56292b0b2ea7SShri Abhyankar if (flg) { 56302b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 5631766f9fbaSBarry Smith Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5632766f9fbaSBarry Smith /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 56332b0b2ea7SShri Abhyankar pj = 
b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 56342b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 56352b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 56362b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5637766f9fbaSBarry Smith vv = rtmp + bs2*pj[j]; 5638766f9fbaSBarry Smith Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5639766f9fbaSBarry Smith /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 56402b0b2ea7SShri Abhyankar pv += bs2; 56412b0b2ea7SShri Abhyankar } 5642766f9fbaSBarry Smith ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 56432b0b2ea7SShri Abhyankar } 56442b0b2ea7SShri Abhyankar } 56452b0b2ea7SShri Abhyankar 56462b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 56472b0b2ea7SShri Abhyankar /* L part */ 56482b0b2ea7SShri Abhyankar pv = b->a + bs2*bi[i] ; 56492b0b2ea7SShri Abhyankar pj = b->j + bi[i] ; 56502b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56512b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 56522b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56532b0b2ea7SShri Abhyankar } 56542b0b2ea7SShri Abhyankar 56552b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 56562b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 56572b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 56582b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5659766f9fbaSBarry Smith /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5660182b8fbaSHong Zhang ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 56612b0b2ea7SShri Abhyankar 56622b0b2ea7SShri Abhyankar /* U part */ 56632b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 56642b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 56652b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 56662b0b2ea7SShri 
Abhyankar for (j=0; j<nz; j++){ 56672b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56682b0b2ea7SShri Abhyankar } 56692b0b2ea7SShri Abhyankar } 56702b0b2ea7SShri Abhyankar 56712b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5672832cc040SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5673766f9fbaSBarry Smith C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 56742b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5675766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 56762b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 56772b0b2ea7SShri Abhyankar } 56782b0b2ea7SShri Abhyankar 56796bce7ff8SHong Zhang #undef __FUNCT__ 56804dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 56814dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 56826bce7ff8SHong Zhang { 56836bce7ff8SHong Zhang Mat C=B; 56846bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 56856bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 56866bce7ff8SHong Zhang PetscErrorCode ierr; 56876bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 56886bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 56896bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5690b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5691914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5692914a18a2SHong Zhang MatScalar *v_work; 5693ae3d28f0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 56946bce7ff8SHong Zhang 56956bce7ff8SHong Zhang PetscFunctionBegin; 56966bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 56976bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5698ae3d28f0SHong Zhang 5699fca92195SBarry Smith ierr = 
PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5700fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 57016bce7ff8SHong Zhang ics = ic; 57026bce7ff8SHong Zhang 5703914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5704fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5705914a18a2SHong Zhang 57066bce7ff8SHong Zhang for (i=0; i<n; i++){ 57076bce7ff8SHong Zhang /* zero rtmp */ 57086bce7ff8SHong Zhang /* L part */ 57096bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 57106bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5711914a18a2SHong Zhang for (j=0; j<nz; j++){ 5712914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5713914a18a2SHong Zhang } 57146bce7ff8SHong Zhang 57156bce7ff8SHong Zhang /* U part */ 57161a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 57171a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 57181a83e813SShri Abhyankar for (j=0; j<nz; j++){ 57191a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57201a83e813SShri Abhyankar } 57211a83e813SShri Abhyankar 57221a83e813SShri Abhyankar /* load in initial (unfactored row) */ 57231a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 57241a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 57251a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 57261a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57271a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 57281a83e813SShri Abhyankar } 57291a83e813SShri Abhyankar 57301a83e813SShri Abhyankar /* elimination */ 57311a83e813SShri Abhyankar bjtmp = bj + bi[i]; 57321a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 57331a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 57341a83e813SShri Abhyankar row = bjtmp[k]; 57351a83e813SShri Abhyankar pc = rtmp + bs2*row; 57361a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) 
{ if (pc[j]!=0.0) { flg = 1; break; }} 57371a83e813SShri Abhyankar if (flg) { 57381a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 57391a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 57401a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 57411a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 57421a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 57431a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57441a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 57451a83e813SShri Abhyankar } 57461a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 57471a83e813SShri Abhyankar } 57481a83e813SShri Abhyankar } 57491a83e813SShri Abhyankar 57501a83e813SShri Abhyankar /* finished row so stick it into b->a */ 57511a83e813SShri Abhyankar /* L part */ 57521a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 57531a83e813SShri Abhyankar pj = b->j + bi[i] ; 57541a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 57551a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57561a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57571a83e813SShri Abhyankar } 57581a83e813SShri Abhyankar 57591a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 57601a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 57611a83e813SShri Abhyankar pj = b->j + bdiag[i]; 5762e32f2f54SBarry Smith /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 57631a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57641a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 57651a83e813SShri Abhyankar 57661a83e813SShri Abhyankar /* U part */ 57671a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 
57681a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 57691a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 57701a83e813SShri Abhyankar for (j=0; j<nz; j++){ 57711a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57721a83e813SShri Abhyankar } 57731a83e813SShri Abhyankar } 57741a83e813SShri Abhyankar 57751a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5776fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 57771a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 57781a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 57791a83e813SShri Abhyankar 5780ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5781ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5782ae3d28f0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 5783ae3d28f0SHong Zhang if (both_identity){ 57844dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5785ae3d28f0SHong Zhang } else { 57864dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5787ae3d28f0SHong Zhang } 57884dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5789ae3d28f0SHong Zhang 57901a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 5791766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 57921a83e813SShri Abhyankar PetscFunctionReturn(0); 57931a83e813SShri Abhyankar } 57941a83e813SShri Abhyankar 57956bce7ff8SHong Zhang /* 57966bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 57974dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 57984dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 
57996bce7ff8SHong Zhang */ 5800c0c7eb62SShri Abhyankar 58016bce7ff8SHong Zhang #undef __FUNCT__ 58024dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 58034dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 58046bce7ff8SHong Zhang { 58056bce7ff8SHong Zhang 58066bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 58076bce7ff8SHong Zhang PetscErrorCode ierr; 580816a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 580935aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 581035aa4fcfSShri Abhyankar 581135aa4fcfSShri Abhyankar PetscFunctionBegin; 581235aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 581335aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 581435aa4fcfSShri Abhyankar 581535aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 581635aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 581735aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 581835aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 581935aa4fcfSShri Abhyankar if (!b->diag){ 582035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 582135aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 582235aa4fcfSShri Abhyankar } 582335aa4fcfSShri Abhyankar bdiag = b->diag; 582435aa4fcfSShri Abhyankar 582535aa4fcfSShri Abhyankar if (n > 0) { 582635aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 582735aa4fcfSShri Abhyankar } 582835aa4fcfSShri Abhyankar 582935aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 583035aa4fcfSShri Abhyankar bi = b->i; 
583135aa4fcfSShri Abhyankar bj = b->j; 583235aa4fcfSShri Abhyankar 583335aa4fcfSShri Abhyankar /* L part */ 583435aa4fcfSShri Abhyankar bi[0] = 0; 583535aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 583635aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 583735aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 583835aa4fcfSShri Abhyankar aj = a->j + ai[i]; 583935aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 584035aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 584135aa4fcfSShri Abhyankar } 584235aa4fcfSShri Abhyankar } 584335aa4fcfSShri Abhyankar 584435aa4fcfSShri Abhyankar /* U part */ 584535aa4fcfSShri Abhyankar bi_temp = bi[n]; 584635aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 584735aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 584835aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 584935aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 585035aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 585135aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 585235aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 585335aa4fcfSShri Abhyankar } 585435aa4fcfSShri Abhyankar /* diag[i] */ 585535aa4fcfSShri Abhyankar *bj = i; bj++; 585635aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 585735aa4fcfSShri Abhyankar } 585835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 585935aa4fcfSShri Abhyankar } 586035aa4fcfSShri Abhyankar 586135aa4fcfSShri Abhyankar #undef __FUNCT__ 58624dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 58634dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 586416a2bf60SHong Zhang { 586516a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 586616a2bf60SHong Zhang IS isicol; 586716a2bf60SHong Zhang PetscErrorCode ierr; 586816a2bf60SHong Zhang const PetscInt *r,*ic; 58697fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 587016a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 587116a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 587216a2bf60SHong Zhang PetscInt 
i,levels,diagonal_fill; 58737fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 587416a2bf60SHong Zhang PetscReal f; 587516a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 587616a2bf60SHong Zhang PetscBT lnkbt; 587716a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 587816a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 587916a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 588016a2bf60SHong Zhang PetscTruth missing; 58817fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 588216a2bf60SHong Zhang 588316a2bf60SHong Zhang PetscFunctionBegin; 5884e32f2f54SBarry Smith if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 58856ba06ab7SHong Zhang if (bs>1){ /* check shifttype */ 58866ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 58876ba06ab7SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 58886ba06ab7SHong Zhang } 58896ba06ab7SHong Zhang 589016a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5891e32f2f54SBarry Smith if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 589216a2bf60SHong Zhang 589316a2bf60SHong Zhang f = info->fill; 589416a2bf60SHong Zhang levels = (PetscInt)info->levels; 589516a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 589616a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 589716a2bf60SHong Zhang 589816a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 589916a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 59007fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 590116a2bf60SHong Zhang 59027fa3a6a0SHong 
Zhang if (!levels && both_identity) { 590316a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 59044dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 59054dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 590635aa4fcfSShri Abhyankar 5907d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 590835aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 590935aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 591035aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 591135aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 591235aa4fcfSShri Abhyankar b->row = isrow; 591335aa4fcfSShri Abhyankar b->col = iscol; 591435aa4fcfSShri Abhyankar b->icol = isicol; 591535aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 591635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 591735aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 591835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 591935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 592035aa4fcfSShri Abhyankar } 592135aa4fcfSShri Abhyankar 592235aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 592335aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 592435aa4fcfSShri Abhyankar 592535aa4fcfSShri Abhyankar /* get new row pointers */ 592635aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 592735aa4fcfSShri Abhyankar bi[0] = 0; 592835aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 592935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 593035aa4fcfSShri Abhyankar bdiag[0] = 0; 593135aa4fcfSShri Abhyankar 5932fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 593335aa4fcfSShri Abhyankar 593435aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 593535aa4fcfSShri Abhyankar nlnk = n + 1; 593635aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 593735aa4fcfSShri Abhyankar 593835aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 593935aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 594035aa4fcfSShri Abhyankar current_space = free_space; 594135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 594235aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 594335aa4fcfSShri Abhyankar 594435aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 594535aa4fcfSShri Abhyankar nzi = 0; 594635aa4fcfSShri Abhyankar /* copy current row into linked list */ 594735aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 5948e32f2f54SBarry Smith if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row 
in original ordering %D in permuted ordering %D",r[i],i); 594935aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 595035aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 595135aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 595235aa4fcfSShri Abhyankar nzi += nlnk; 595335aa4fcfSShri Abhyankar 595435aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 595535aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 595635aa4fcfSShri Abhyankar fm = n; 595735aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 595835aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 595935aa4fcfSShri Abhyankar lnk[fm] = i; 596035aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 596135aa4fcfSShri Abhyankar nzi++; dcount++; 596235aa4fcfSShri Abhyankar } 596335aa4fcfSShri Abhyankar 596435aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 596535aa4fcfSShri Abhyankar nzbd = 0; 596635aa4fcfSShri Abhyankar prow = lnk[n]; 596735aa4fcfSShri Abhyankar while (prow < i) { 596835aa4fcfSShri Abhyankar nnz = bdiag[prow]; 596935aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 597035aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 597135aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 597235aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 597335aa4fcfSShri Abhyankar nzi += nlnk; 597435aa4fcfSShri Abhyankar prow = lnk[prow]; 597535aa4fcfSShri Abhyankar nzbd++; 597635aa4fcfSShri Abhyankar } 597735aa4fcfSShri Abhyankar bdiag[i] = nzbd; 597835aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 597935aa4fcfSShri Abhyankar 598035aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 598135aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 598235aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 598335aa4fcfSShri 
Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 598435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 598535aa4fcfSShri Abhyankar reallocs++; 598635aa4fcfSShri Abhyankar } 598735aa4fcfSShri Abhyankar 598835aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 598935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 599035aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 599135aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 599235aa4fcfSShri Abhyankar 599335aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 599465e19b50SBarry Smith if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 599535aa4fcfSShri Abhyankar 599635aa4fcfSShri Abhyankar current_space->array += nzi; 599735aa4fcfSShri Abhyankar current_space->local_used += nzi; 599835aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 599935aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 600035aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 600135aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 600235aa4fcfSShri Abhyankar } 600335aa4fcfSShri Abhyankar 600435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 600535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 600635aa4fcfSShri Abhyankar 600735aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 600835aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 600935aa4fcfSShri Abhyankar 601035aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 60112ce24eb6SHong Zhang ierr = 
PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 601235aa4fcfSShri Abhyankar 601335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 601435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 6015fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 601635aa4fcfSShri Abhyankar 601735aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 601835aa4fcfSShri Abhyankar { 6019aef85c9fSShri Abhyankar PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 602035aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 602135aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 602235aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 602335aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 602435aa4fcfSShri Abhyankar if (diagonal_fill) { 602535aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 602635aa4fcfSShri Abhyankar } 602735aa4fcfSShri Abhyankar } 602835aa4fcfSShri Abhyankar #endif 602935aa4fcfSShri Abhyankar 603035aa4fcfSShri Abhyankar /* put together the new matrix */ 603135aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 603235aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 603335aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 603435aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 603535aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 603635aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 603735aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 603835aa4fcfSShri Abhyankar b->j = bj; 603935aa4fcfSShri Abhyankar b->i = bi; 604035aa4fcfSShri Abhyankar b->diag = bdiag; 
604135aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 604235aa4fcfSShri Abhyankar b->ilen = 0; 604335aa4fcfSShri Abhyankar b->imax = 0; 604435aa4fcfSShri Abhyankar b->row = isrow; 604535aa4fcfSShri Abhyankar b->col = iscol; 604635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 604735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 604835aa4fcfSShri Abhyankar b->icol = isicol; 604935aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 605035aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 605135aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 605235aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 605335aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 6054ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 6055ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6056ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 60574dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 605835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 605935aa4fcfSShri Abhyankar } 606035aa4fcfSShri Abhyankar 60614e2b4712SSatish Balay /* 60624e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 60634e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 60644e2b4712SSatish Balay Not a good example of code reuse. 
60654e2b4712SSatish Balay */ 60664a2ae208SSatish Balay #undef __FUNCT__ 606706e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 606806e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 60694e2b4712SSatish Balay { 60704e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 60714e2b4712SSatish Balay IS isicol; 60726849ba73SBarry Smith PetscErrorCode ierr; 60735d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 60745d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6075a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6076d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 607741df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 6078329f5518SBarry Smith PetscReal f; 60794e2b4712SSatish Balay 60804e2b4712SSatish Balay PetscFunctionBegin; 60816bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6082e32f2f54SBarry Smith if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 60836bce7ff8SHong Zhang 6084435faa5fSBarry Smith f = info->fill; 6085690b6cddSBarry Smith levels = (PetscInt)info->levels; 6086690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 60874c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 608816a2bf60SHong Zhang 6089667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6090667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 60917d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 6092309c388cSBarry Smith 609341df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 609416a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 60958b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 60966bce7ff8SHong Zhang 6097d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 6098ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6099bb3d539aSBarry Smith b->row = isrow; 6100bb3d539aSBarry Smith b->col = iscol; 6101bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6102bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6103bb3d539aSBarry Smith b->icol = isicol; 6104bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6105b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 61066bce7ff8SHong Zhang PetscFunctionReturn(0); 61076bce7ff8SHong Zhang } 61086bce7ff8SHong Zhang 61096bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 61104e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 61114e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 61124e2b4712SSatish Balay 61134e2b4712SSatish Balay /* get new row pointers */ 6114690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 61154e2b4712SSatish Balay ainew[0] = 0; 61164e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 6117690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 6118690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 61194e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 6120690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 61214e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 6122690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 61234e2b4712SSatish Balay /* im is level for each filled value 
*/ 6124690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 61254e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 6126690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 61274e2b4712SSatish Balay dloc[0] = 0; 61284e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 6129435faa5fSBarry Smith 6130435faa5fSBarry Smith /* copy prow into linked list */ 61314e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6132e32f2f54SBarry Smith if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 61334e2b4712SSatish Balay xi = aj + ai[r[prow]]; 61344e2b4712SSatish Balay fill[n] = n; 6135435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 61364e2b4712SSatish Balay while (nz--) { 61374e2b4712SSatish Balay fm = n; 61384e2b4712SSatish Balay idx = ic[*xi++]; 61394e2b4712SSatish Balay do { 61404e2b4712SSatish Balay m = fm; 61414e2b4712SSatish Balay fm = fill[m]; 61424e2b4712SSatish Balay } while (fm < idx); 61434e2b4712SSatish Balay fill[m] = idx; 61444e2b4712SSatish Balay fill[idx] = fm; 61454e2b4712SSatish Balay im[idx] = 0; 61464e2b4712SSatish Balay } 6147435faa5fSBarry Smith 6148435faa5fSBarry Smith /* make sure diagonal entry is included */ 6149435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 6150435faa5fSBarry Smith fm = n; 6151435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 6152435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6153435faa5fSBarry Smith fill[fm] = prow; 6154435faa5fSBarry Smith im[prow] = 0; 6155435faa5fSBarry Smith nzf++; 6156335d9088SBarry Smith dcount++; 6157435faa5fSBarry Smith } 6158435faa5fSBarry Smith 61594e2b4712SSatish Balay nzi = 0; 61604e2b4712SSatish Balay row = fill[n]; 61614e2b4712SSatish Balay while (row < prow) { 61624e2b4712SSatish Balay incrlev = im[row] + 1; 61634e2b4712SSatish Balay nz = 
dloc[row]; 6164435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 61654e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 61664e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 61674e2b4712SSatish Balay fm = row; 61684e2b4712SSatish Balay while (nnz-- > 0) { 61694e2b4712SSatish Balay idx = *xi++; 61704e2b4712SSatish Balay if (*flev + incrlev > levels) { 61714e2b4712SSatish Balay flev++; 61724e2b4712SSatish Balay continue; 61734e2b4712SSatish Balay } 61744e2b4712SSatish Balay do { 61754e2b4712SSatish Balay m = fm; 61764e2b4712SSatish Balay fm = fill[m]; 61774e2b4712SSatish Balay } while (fm < idx); 61784e2b4712SSatish Balay if (fm != idx) { 61794e2b4712SSatish Balay im[idx] = *flev + incrlev; 61804e2b4712SSatish Balay fill[m] = idx; 61814e2b4712SSatish Balay fill[idx] = fm; 61824e2b4712SSatish Balay fm = idx; 61834e2b4712SSatish Balay nzf++; 6184ecf371e4SBarry Smith } else { 61854e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 61864e2b4712SSatish Balay } 61874e2b4712SSatish Balay flev++; 61884e2b4712SSatish Balay } 61894e2b4712SSatish Balay row = fill[row]; 61904e2b4712SSatish Balay nzi++; 61914e2b4712SSatish Balay } 61924e2b4712SSatish Balay /* copy new filled row into permanent storage */ 61934e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 61944e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 6195ecf371e4SBarry Smith 6196ecf371e4SBarry Smith /* estimate how much additional space we will need */ 6197ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6198ecf371e4SBarry Smith /* just double the memory each time */ 6199690b6cddSBarry Smith PetscInt maxadd = jmax; 6200ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 62014e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 62024e2b4712SSatish Balay jmax += maxadd; 6203ecf371e4SBarry Smith 6204ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 62055d0c19d7SBarry Smith 
ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 62065d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6207606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 62085d0c19d7SBarry Smith ajnew = xitmp; 62095d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 62105d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6211606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62125d0c19d7SBarry Smith ajfill = xitmp; 6213eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 62144e2b4712SSatish Balay } 62155d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 62164e2b4712SSatish Balay flev = ajfill + ainew[prow]; 62174e2b4712SSatish Balay dloc[prow] = nzi; 62184e2b4712SSatish Balay fm = fill[n]; 62194e2b4712SSatish Balay while (nzf--) { 62205d0c19d7SBarry Smith *xitmp++ = fm; 62214e2b4712SSatish Balay *flev++ = im[fm]; 62224e2b4712SSatish Balay fm = fill[fm]; 62234e2b4712SSatish Balay } 6224435faa5fSBarry Smith /* make sure row has diagonal entry */ 6225435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6226e32f2f54SBarry Smith SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 62272401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6228435faa5fSBarry Smith } 62294e2b4712SSatish Balay } 6230606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62314e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 62324e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6233606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 6234606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 62354e2b4712SSatish Balay 62366cf91177SBarry Smith #if defined(PETSC_USE_INFO) 62374e2b4712SSatish Balay { 6238329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 6239ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6240ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6241ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6242ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6243335d9088SBarry Smith if (diagonal_fill) { 6244ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6245335d9088SBarry Smith } 62464e2b4712SSatish Balay } 624763ba0a88SBarry Smith #endif 62484e2b4712SSatish Balay 62494e2b4712SSatish Balay /* put together the new matrix */ 6250719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6251719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6252ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6253e6b907acSBarry Smith b->free_a = PETSC_TRUE; 6254e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 62557c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 6256a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 62574e2b4712SSatish Balay b->j = ajnew; 62584e2b4712SSatish Balay b->i = ainew; 62594e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 62604e2b4712SSatish Balay b->diag = dloc; 62617f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 62624e2b4712SSatish Balay b->ilen = 0; 62634e2b4712SSatish Balay b->imax = 0; 62644e2b4712SSatish Balay b->row = isrow; 62654e2b4712SSatish Balay b->col = iscol; 6266bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 6267c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6268c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6269e51c0b9cSSatish Balay b->icol = isicol; 627087828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 62714e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 62724e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 6273719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 62744e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 62754e2b4712SSatish Balay 6276ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 6277ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6278ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 62796bce7ff8SHong Zhang 62808b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 62818661488fSKris Buschelman PetscFunctionReturn(0); 62828661488fSKris Buschelman } 62838661488fSKris Buschelman 6284732ee342SKris Buschelman #undef __FUNCT__ 62857e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6286dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 62877e7071cdSKris Buschelman { 628812272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 628912272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 62905a9542e3SKris Buschelman PetscFunctionBegin; 62917cf1b8d3SKris Buschelman /* Undo Column scaling */ 62927cf1b8d3SKris Buschelman /* while (nz--) { */ 62937cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 62947cf1b8d3SKris Buschelman /* } */ 6295c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6296c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62977cf1b8d3SKris Buschelman PetscFunctionReturn(0); 62987cf1b8d3SKris Buschelman } 62997cf1b8d3SKris Buschelman 63007cf1b8d3SKris Buschelman #undef __FUNCT__ 63017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 63037cf1b8d3SKris Buschelman { 63047cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6305b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 63062aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 63075a9542e3SKris Buschelman PetscFunctionBegin; 63080b9da03eSKris Buschelman /* Is this really necessary? */ 630920235379SKris Buschelman while (nz--) { 63100b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 63117e7071cdSKris Buschelman } 6312c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 63137e7071cdSKris Buschelman PetscFunctionReturn(0); 63147e7071cdSKris Buschelman } 63157e7071cdSKris Buschelman 6316732ee342SKris Buschelman 6317