1be1d678aSKris Buschelman 24e2b4712SSatish Balay /* 34e2b4712SSatish Balay Factorization code for BAIJ format. 44e2b4712SSatish Balay */ 54e2b4712SSatish Balay 6c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 7c6db04a5SJed Brown #include <../src/mat/blockinvert.h> 8c6db04a5SJed Brown #include <petscbt.h> 9c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 104e2b4712SSatish Balay 114a2ae208SSatish Balay #undef __FUNCT__ 1293fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 1393fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 1493fd935bSShri Abhyankar { 1593fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 1693fd935bSShri Abhyankar PetscErrorCode ierr; 1793fd935bSShri Abhyankar const PetscInt *adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 1893fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 1993fd935bSShri Abhyankar PetscInt nz; 2093fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 2193fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 2293fd935bSShri Abhyankar const PetscScalar *b; 2393fd935bSShri Abhyankar 2493fd935bSShri Abhyankar PetscFunctionBegin; 253649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2693fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2793fd935bSShri Abhyankar tmp = a->solve_work; 2893fd935bSShri Abhyankar 2993fd935bSShri Abhyankar 3093fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 3193fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[i]; 3293fd935bSShri Abhyankar 3393fd935bSShri Abhyankar /* forward solve the U^T */ 3493fd935bSShri Abhyankar for (i=0; i<n; i++) { 3593fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 3693fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 3793fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 3893fd935bSShri Abhyankar s1 = tmp[i]; 3993fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 4093fd935bSShri 
Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 4193fd935bSShri Abhyankar tmp[i] = s1; 4293fd935bSShri Abhyankar } 4393fd935bSShri Abhyankar 4493fd935bSShri Abhyankar /* backward solve the L^T */ 4593fd935bSShri Abhyankar for (i=n-1; i>=0; i--){ 4693fd935bSShri Abhyankar v = aa + ai[i]; 4793fd935bSShri Abhyankar vi = aj + ai[i]; 4893fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 4993fd935bSShri Abhyankar s1 = tmp[i]; 5093fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 5193fd935bSShri Abhyankar } 5293fd935bSShri Abhyankar 5393fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 5493fd935bSShri Abhyankar for (i=0; i<n; i++) x[i] = tmp[i]; 5593fd935bSShri Abhyankar 563649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5793fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5893fd935bSShri Abhyankar 5993fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 6093fd935bSShri Abhyankar PetscFunctionReturn(0); 6193fd935bSShri Abhyankar } 6293fd935bSShri Abhyankar 6393fd935bSShri Abhyankar #undef __FUNCT__ 6406e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 6506e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 66f1af5d2fSBarry Smith { 67f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 68dfbe8321SBarry Smith PetscErrorCode ierr; 690b68f018SBarry Smith PetscInt i,nz; 700b68f018SBarry Smith const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 710b68f018SBarry Smith const MatScalar *aa=a->a,*v; 720b68f018SBarry Smith PetscScalar s1,*x; 730b68f018SBarry Smith const PetscScalar *b; 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith PetscFunctionBegin; 76ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 773649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 781ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 79f1af5d2fSBarry Smith 
80f1af5d2fSBarry Smith /* forward solve the U^T */ 81f1af5d2fSBarry Smith for (i=0; i<n; i++) { 82f1af5d2fSBarry Smith 83f1af5d2fSBarry Smith v = aa + diag[i]; 84f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 85ef66eb69SBarry Smith s1 = (*v++)*x[i]; 86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 90f1af5d2fSBarry Smith } 91f1af5d2fSBarry Smith x[i] = s1; 92f1af5d2fSBarry Smith } 93f1af5d2fSBarry Smith /* backward solve the L^T */ 94f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 95f1af5d2fSBarry Smith v = aa + diag[i] - 1; 96f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 97f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 98f1af5d2fSBarry Smith s1 = x[i]; 99f1af5d2fSBarry Smith while (nz--) { 100f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 101f1af5d2fSBarry Smith } 102f1af5d2fSBarry Smith } 1033649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 105dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 106f1af5d2fSBarry Smith PetscFunctionReturn(0); 107f1af5d2fSBarry Smith } 108f1af5d2fSBarry Smith 1094a2ae208SSatish Balay #undef __FUNCT__ 11006e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 11106e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 112f1af5d2fSBarry Smith { 113f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 114dfbe8321SBarry Smith PetscErrorCode ierr; 115b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 116b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 117b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 118b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 119b3260449SShri Abhyankar const PetscScalar *b; 120f1af5d2fSBarry Smith 
121f1af5d2fSBarry Smith PetscFunctionBegin; 122ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1233649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1241ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125f1af5d2fSBarry Smith 126f1af5d2fSBarry Smith /* forward solve the U^T */ 127f1af5d2fSBarry Smith idx = 0; 128f1af5d2fSBarry Smith for (i=0; i<n; i++) { 129f1af5d2fSBarry Smith 130f1af5d2fSBarry Smith v = aa + 4*diag[i]; 131f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 132ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 133f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 134f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 135f1af5d2fSBarry Smith v += 4; 136f1af5d2fSBarry Smith 137f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 138f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 139f1af5d2fSBarry Smith while (nz--) { 140f1af5d2fSBarry Smith oidx = 2*(*vi++); 141f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 142f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 143f1af5d2fSBarry Smith v += 4; 144f1af5d2fSBarry Smith } 145f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 146f1af5d2fSBarry Smith idx += 2; 147f1af5d2fSBarry Smith } 148f1af5d2fSBarry Smith /* backward solve the L^T */ 149f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 150f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 151f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 152f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 153f1af5d2fSBarry Smith idt = 2*i; 154f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 155f1af5d2fSBarry Smith while (nz--) { 156f1af5d2fSBarry Smith idx = 2*(*vi--); 157f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 158f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 159f1af5d2fSBarry Smith v -= 4; 160f1af5d2fSBarry Smith } 161f1af5d2fSBarry Smith } 1623649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 1631ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 164dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 165f1af5d2fSBarry Smith PetscFunctionReturn(0); 166f1af5d2fSBarry Smith } 167f1af5d2fSBarry Smith 1684a2ae208SSatish Balay #undef __FUNCT__ 1694dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 1704dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 1716929473cSShri Abhyankar { 1726929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1736929473cSShri Abhyankar PetscErrorCode ierr; 174b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1756929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 176b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 177b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 178b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x; 179b3260449SShri Abhyankar const PetscScalar *b; 1806929473cSShri Abhyankar 1816929473cSShri Abhyankar PetscFunctionBegin; 1826929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1833649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 1846929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1856929473cSShri Abhyankar 1866929473cSShri Abhyankar /* forward solve the U^T */ 1876929473cSShri Abhyankar idx = 0; 1886929473cSShri Abhyankar for (i=0; i<n; i++) { 1896929473cSShri Abhyankar v = aa + bs2*diag[i]; 1906929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1916929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1926929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1936929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1946929473cSShri Abhyankar v -= bs2; 1956929473cSShri Abhyankar 1966929473cSShri Abhyankar vi = aj + diag[i] - 1; 1976929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1986929473cSShri Abhyankar for(j=0;j>-nz;j--){ 1996929473cSShri Abhyankar oidx = bs*vi[j]; 2006929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 2016929473cSShri Abhyankar 
x[oidx+1] -= v[2]*s1 + v[3]*s2; 2026929473cSShri Abhyankar v -= bs2; 2036929473cSShri Abhyankar } 2046929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 2056929473cSShri Abhyankar idx += bs; 2066929473cSShri Abhyankar } 2076929473cSShri Abhyankar /* backward solve the L^T */ 2086929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 2096929473cSShri Abhyankar v = aa + bs2*ai[i]; 2106929473cSShri Abhyankar vi = aj + ai[i]; 2116929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 2126929473cSShri Abhyankar idt = bs*i; 2136929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 2146929473cSShri Abhyankar for(j=0;j<nz;j++){ 2156929473cSShri Abhyankar idx = bs*vi[j]; 2166929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 2176929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 2186929473cSShri Abhyankar v += bs2; 2196929473cSShri Abhyankar } 2206929473cSShri Abhyankar } 2213649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2226929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2236929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2246929473cSShri Abhyankar PetscFunctionReturn(0); 2256929473cSShri Abhyankar } 2266929473cSShri Abhyankar 2276929473cSShri Abhyankar #undef __FUNCT__ 22806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 22906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 230f1af5d2fSBarry Smith { 231f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 232dfbe8321SBarry Smith PetscErrorCode ierr; 233b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 234b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 235b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 236b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 237b3260449SShri Abhyankar const PetscScalar *b; 238f1af5d2fSBarry Smith 239f1af5d2fSBarry Smith PetscFunctionBegin; 240ef66eb69SBarry Smith ierr = 
VecCopy(bb,xx);CHKERRQ(ierr); 2413649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 2421ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 243f1af5d2fSBarry Smith 244f1af5d2fSBarry Smith /* forward solve the U^T */ 245f1af5d2fSBarry Smith idx = 0; 246f1af5d2fSBarry Smith for (i=0; i<n; i++) { 247f1af5d2fSBarry Smith 248f1af5d2fSBarry Smith v = aa + 9*diag[i]; 249f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 250ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 251f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 252f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 253f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 254f1af5d2fSBarry Smith v += 9; 255f1af5d2fSBarry Smith 256f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 257f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 258f1af5d2fSBarry Smith while (nz--) { 259f1af5d2fSBarry Smith oidx = 3*(*vi++); 260f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 261f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 262f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 263f1af5d2fSBarry Smith v += 9; 264f1af5d2fSBarry Smith } 265f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 266f1af5d2fSBarry Smith idx += 3; 267f1af5d2fSBarry Smith } 268f1af5d2fSBarry Smith /* backward solve the L^T */ 269f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 270f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 271f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 272f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 273f1af5d2fSBarry Smith idt = 3*i; 274f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 275f1af5d2fSBarry Smith while (nz--) { 276f1af5d2fSBarry Smith idx = 3*(*vi--); 277f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 278f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 279f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 280f1af5d2fSBarry Smith v -= 9; 281f1af5d2fSBarry Smith } 
282f1af5d2fSBarry Smith } 2833649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 2841ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 285dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 286f1af5d2fSBarry Smith PetscFunctionReturn(0); 287f1af5d2fSBarry Smith } 288f1af5d2fSBarry Smith 2894a2ae208SSatish Balay #undef __FUNCT__ 2904dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 2914dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2928499736aSShri Abhyankar { 2938499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2948499736aSShri Abhyankar PetscErrorCode ierr; 295b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 2968499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 297b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 298b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 299b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x; 300b3260449SShri Abhyankar const PetscScalar *b; 3018499736aSShri Abhyankar 3028499736aSShri Abhyankar PetscFunctionBegin; 3038499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3043649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3058499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3068499736aSShri Abhyankar 3078499736aSShri Abhyankar /* forward solve the U^T */ 3088499736aSShri Abhyankar idx = 0; 3098499736aSShri Abhyankar for (i=0; i<n; i++) { 3108499736aSShri Abhyankar v = aa + bs2*diag[i]; 3118499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 3128499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3138499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 3148499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 3158499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 3168499736aSShri Abhyankar v -= bs2; 3178499736aSShri 
Abhyankar 3188499736aSShri Abhyankar vi = aj + diag[i] - 1; 3198499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 3208499736aSShri Abhyankar for(j=0;j>-nz;j--){ 3218499736aSShri Abhyankar oidx = bs*vi[j]; 3228499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3238499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3248499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3258499736aSShri Abhyankar v -= bs2; 3268499736aSShri Abhyankar } 3278499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 3288499736aSShri Abhyankar idx += bs; 3298499736aSShri Abhyankar } 3308499736aSShri Abhyankar /* backward solve the L^T */ 3318499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 3328499736aSShri Abhyankar v = aa + bs2*ai[i]; 3338499736aSShri Abhyankar vi = aj + ai[i]; 3348499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 3358499736aSShri Abhyankar idt = bs*i; 3368499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 3378499736aSShri Abhyankar for(j=0;j<nz;j++){ 3388499736aSShri Abhyankar idx = bs*vi[j]; 3398499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 3408499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 3418499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 3428499736aSShri Abhyankar v += bs2; 3438499736aSShri Abhyankar } 3448499736aSShri Abhyankar } 3453649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 3468499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3478499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3488499736aSShri Abhyankar PetscFunctionReturn(0); 3498499736aSShri Abhyankar } 3508499736aSShri Abhyankar 3518499736aSShri Abhyankar #undef __FUNCT__ 35206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 35306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 354f1af5d2fSBarry Smith { 355f1af5d2fSBarry 
Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 356dfbe8321SBarry Smith PetscErrorCode ierr; 357b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 358b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 359b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 360b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 361b3260449SShri Abhyankar const PetscScalar *b; 362f1af5d2fSBarry Smith 363f1af5d2fSBarry Smith PetscFunctionBegin; 364ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3653649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 3661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 367f1af5d2fSBarry Smith 368f1af5d2fSBarry Smith /* forward solve the U^T */ 369f1af5d2fSBarry Smith idx = 0; 370f1af5d2fSBarry Smith for (i=0; i<n; i++) { 371f1af5d2fSBarry Smith 372f1af5d2fSBarry Smith v = aa + 16*diag[i]; 373f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 374ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 375f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 376f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 377f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 378f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 379f1af5d2fSBarry Smith v += 16; 380f1af5d2fSBarry Smith 381f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 382f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 383f1af5d2fSBarry Smith while (nz--) { 384f1af5d2fSBarry Smith oidx = 4*(*vi++); 385f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 386f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 387f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 388f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 389f1af5d2fSBarry Smith v += 16; 390f1af5d2fSBarry Smith } 391f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = 
s3;x[3+idx] = s4; 392f1af5d2fSBarry Smith idx += 4; 393f1af5d2fSBarry Smith } 394f1af5d2fSBarry Smith /* backward solve the L^T */ 395f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 396f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 397f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 398f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 399f1af5d2fSBarry Smith idt = 4*i; 400f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 401f1af5d2fSBarry Smith while (nz--) { 402f1af5d2fSBarry Smith idx = 4*(*vi--); 403f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 404f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 405f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 406f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 407f1af5d2fSBarry Smith v -= 16; 408f1af5d2fSBarry Smith } 409f1af5d2fSBarry Smith } 4103649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 412dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 413f1af5d2fSBarry Smith PetscFunctionReturn(0); 414f1af5d2fSBarry Smith } 415f1af5d2fSBarry Smith 4164a2ae208SSatish Balay #undef __FUNCT__ 4174dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 4184dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4198499736aSShri Abhyankar { 4208499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4218499736aSShri Abhyankar PetscErrorCode ierr; 422b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 4238499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 424b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 425b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 426b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 427b3260449SShri Abhyankar const 
PetscScalar *b; 4288499736aSShri Abhyankar 4298499736aSShri Abhyankar PetscFunctionBegin; 4308499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4313649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4328499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4338499736aSShri Abhyankar 4348499736aSShri Abhyankar /* forward solve the U^T */ 4358499736aSShri Abhyankar idx = 0; 4368499736aSShri Abhyankar for (i=0; i<n; i++) { 4378499736aSShri Abhyankar v = aa + bs2*diag[i]; 4388499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 4398499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 4408499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 4418499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 4428499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 4438499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 4448499736aSShri Abhyankar v -= bs2; 4458499736aSShri Abhyankar 4468499736aSShri Abhyankar vi = aj + diag[i] - 1; 4478499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 4488499736aSShri Abhyankar for(j=0;j>-nz;j--){ 4498499736aSShri Abhyankar oidx = bs*vi[j]; 4508499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4518499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4528499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4538499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4548499736aSShri Abhyankar v -= bs2; 4558499736aSShri Abhyankar } 4568499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 4578499736aSShri Abhyankar idx += bs; 4588499736aSShri Abhyankar } 4598499736aSShri Abhyankar /* backward solve the L^T */ 4608499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 4618499736aSShri Abhyankar v = aa + bs2*ai[i]; 4628499736aSShri Abhyankar vi = aj + ai[i]; 4638499736aSShri Abhyankar nz = ai[i+1] - 
ai[i]; 4648499736aSShri Abhyankar idt = bs*i; 4658499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 4668499736aSShri Abhyankar for(j=0;j<nz;j++){ 4678499736aSShri Abhyankar idx = bs*vi[j]; 4688499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 4698499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 4708499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 4718499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 4728499736aSShri Abhyankar v += bs2; 4738499736aSShri Abhyankar } 4748499736aSShri Abhyankar } 4753649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4768499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4778499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4788499736aSShri Abhyankar PetscFunctionReturn(0); 4798499736aSShri Abhyankar } 4808499736aSShri Abhyankar 4818499736aSShri Abhyankar #undef __FUNCT__ 48206e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 48306e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 484f1af5d2fSBarry Smith { 485f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 486dfbe8321SBarry Smith PetscErrorCode ierr; 487b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 488b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 489b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 490b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 491b3260449SShri Abhyankar const PetscScalar *b; 492f1af5d2fSBarry Smith 493f1af5d2fSBarry Smith PetscFunctionBegin; 494ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4953649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4961ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 497f1af5d2fSBarry Smith 498f1af5d2fSBarry Smith /* 
forward solve the U^T */ 499f1af5d2fSBarry Smith idx = 0; 500f1af5d2fSBarry Smith for (i=0; i<n; i++) { 501f1af5d2fSBarry Smith 502f1af5d2fSBarry Smith v = aa + 25*diag[i]; 503f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 504ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 505f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 506f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 507f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 508f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 509f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 510f1af5d2fSBarry Smith v += 25; 511f1af5d2fSBarry Smith 512f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 513f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 514f1af5d2fSBarry Smith while (nz--) { 515f1af5d2fSBarry Smith oidx = 5*(*vi++); 516f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 517f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 518f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 519f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 520f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 521f1af5d2fSBarry Smith v += 25; 522f1af5d2fSBarry Smith } 523f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 524f1af5d2fSBarry Smith idx += 5; 525f1af5d2fSBarry Smith } 526f1af5d2fSBarry Smith /* backward solve the L^T */ 527f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 528f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 529f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 530f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 531f1af5d2fSBarry Smith idt = 5*i; 532f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = 
x[3+idt]; s5 = x[4+idt]; 533f1af5d2fSBarry Smith while (nz--) { 534f1af5d2fSBarry Smith idx = 5*(*vi--); 535f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 536f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 537f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 538f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 539f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 540f1af5d2fSBarry Smith v -= 25; 541f1af5d2fSBarry Smith } 542f1af5d2fSBarry Smith } 5433649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 545dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 546f1af5d2fSBarry Smith PetscFunctionReturn(0); 547f1af5d2fSBarry Smith } 548f1af5d2fSBarry Smith 5494a2ae208SSatish Balay #undef __FUNCT__ 5504dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 5514dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 5528499736aSShri Abhyankar { 5538499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5548499736aSShri Abhyankar PetscErrorCode ierr; 555b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5568499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 557b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 558b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 559b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 560b3260449SShri Abhyankar const PetscScalar *b; 5618499736aSShri Abhyankar 5628499736aSShri Abhyankar PetscFunctionBegin; 5638499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5643649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5658499736aSShri Abhyankar ierr = 
VecGetArray(xx,&x);CHKERRQ(ierr); 5668499736aSShri Abhyankar 5678499736aSShri Abhyankar /* forward solve the U^T */ 5688499736aSShri Abhyankar idx = 0; 5698499736aSShri Abhyankar for (i=0; i<n; i++) { 5708499736aSShri Abhyankar v = aa + bs2*diag[i]; 5718499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 5728499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 5738499736aSShri Abhyankar x5 = x[4+idx]; 5748499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 5758499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 5768499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 5778499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 5788499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 5798499736aSShri Abhyankar v -= bs2; 5808499736aSShri Abhyankar 5818499736aSShri Abhyankar vi = aj + diag[i] - 1; 5828499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 5838499736aSShri Abhyankar for(j=0;j>-nz;j--){ 5848499736aSShri Abhyankar oidx = bs*vi[j]; 5858499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 5868499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 5878499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 5888499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 5898499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 5908499736aSShri Abhyankar v -= bs2; 5918499736aSShri Abhyankar } 5928499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 5938499736aSShri Abhyankar idx += bs; 5948499736aSShri Abhyankar } 5958499736aSShri Abhyankar /* backward solve the L^T */ 5968499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 5978499736aSShri Abhyankar v = aa + bs2*ai[i]; 5988499736aSShri 
Abhyankar vi = aj + ai[i]; 5998499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 6008499736aSShri Abhyankar idt = bs*i; 6018499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 6028499736aSShri Abhyankar for(j=0;j<nz;j++){ 6038499736aSShri Abhyankar idx = bs*vi[j]; 6048499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 6058499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 6068499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 6078499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 6088499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 6098499736aSShri Abhyankar v += bs2; 6108499736aSShri Abhyankar } 6118499736aSShri Abhyankar } 6123649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 6138499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 6148499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 6158499736aSShri Abhyankar PetscFunctionReturn(0); 6168499736aSShri Abhyankar } 6178499736aSShri Abhyankar 6188499736aSShri Abhyankar #undef __FUNCT__ 61906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 62006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 621f1af5d2fSBarry Smith { 622f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 623dfbe8321SBarry Smith PetscErrorCode ierr; 624b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 625b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 626b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 627b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 628b3260449SShri Abhyankar const PetscScalar *b; 629f1af5d2fSBarry Smith 630f1af5d2fSBarry Smith PetscFunctionBegin; 631ef66eb69SBarry 
Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 6323649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 6331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 634f1af5d2fSBarry Smith 635f1af5d2fSBarry Smith /* forward solve the U^T */ 636f1af5d2fSBarry Smith idx = 0; 637f1af5d2fSBarry Smith for (i=0; i<n; i++) { 638f1af5d2fSBarry Smith 639f1af5d2fSBarry Smith v = aa + 36*diag[i]; 640f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 641ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 642ef66eb69SBarry Smith x6 = x[5+idx]; 643f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 644f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 645f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 646f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 647f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 648f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 649f1af5d2fSBarry Smith v += 36; 650f1af5d2fSBarry Smith 651f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 652f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 653f1af5d2fSBarry Smith while (nz--) { 654f1af5d2fSBarry Smith oidx = 6*(*vi++); 655f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 656f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 657f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 658f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 659f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 660f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + 
v[35]*s6; 661f1af5d2fSBarry Smith v += 36; 662f1af5d2fSBarry Smith } 663f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 664f1af5d2fSBarry Smith x[5+idx] = s6; 665f1af5d2fSBarry Smith idx += 6; 666f1af5d2fSBarry Smith } 667f1af5d2fSBarry Smith /* backward solve the L^T */ 668f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 669f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 670f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 671f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 672f1af5d2fSBarry Smith idt = 6*i; 673f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 674f1af5d2fSBarry Smith s6 = x[5+idt]; 675f1af5d2fSBarry Smith while (nz--) { 676f1af5d2fSBarry Smith idx = 6*(*vi--); 677f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 678f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 679f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 680f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 681f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 682f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 683f1af5d2fSBarry Smith v -= 36; 684f1af5d2fSBarry Smith } 685f1af5d2fSBarry Smith } 6863649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 6871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 688dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 689f1af5d2fSBarry Smith PetscFunctionReturn(0); 690f1af5d2fSBarry Smith } 691f1af5d2fSBarry Smith 6924a2ae208SSatish Balay #undef __FUNCT__ 6934dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 6944dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec 
bb,Vec xx) 6958499736aSShri Abhyankar { 6968499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 6978499736aSShri Abhyankar PetscErrorCode ierr; 698b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 6998499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 700b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 701b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 702b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 703b3260449SShri Abhyankar const PetscScalar *b; 7048499736aSShri Abhyankar 7058499736aSShri Abhyankar PetscFunctionBegin; 7068499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7073649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 7088499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 7098499736aSShri Abhyankar 7108499736aSShri Abhyankar /* forward solve the U^T */ 7118499736aSShri Abhyankar idx = 0; 7128499736aSShri Abhyankar for (i=0; i<n; i++) { 7138499736aSShri Abhyankar v = aa + bs2*diag[i]; 7148499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 7158499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 7168499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 7178499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 7188499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 7198499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 7208499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 7218499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 7228499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 7238499736aSShri Abhyankar v -= bs2; 7248499736aSShri Abhyankar 7258499736aSShri Abhyankar vi = aj + diag[i] - 1; 7268499736aSShri Abhyankar nz = diag[i] - 
diag[i+1] - 1; 7278499736aSShri Abhyankar for(j=0;j>-nz;j--){ 7288499736aSShri Abhyankar oidx = bs*vi[j]; 7298499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7308499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7318499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7328499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7338499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 7348499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7358499736aSShri Abhyankar v -= bs2; 7368499736aSShri Abhyankar } 7378499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 7388499736aSShri Abhyankar x[5+idx] = s6; 7398499736aSShri Abhyankar idx += bs; 7408499736aSShri Abhyankar } 7418499736aSShri Abhyankar /* backward solve the L^T */ 7428499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 7438499736aSShri Abhyankar v = aa + bs2*ai[i]; 7448499736aSShri Abhyankar vi = aj + ai[i]; 7458499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 7468499736aSShri Abhyankar idt = bs*i; 7478499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 7488499736aSShri Abhyankar s6 = x[5+idt]; 7498499736aSShri Abhyankar for(j=0;j<nz;j++){ 7508499736aSShri Abhyankar idx = bs*vi[j]; 7518499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 7528499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 7538499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 7548499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 7558499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 
+ v[28]*s5 + v[29]*s6; 7568499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 7578499736aSShri Abhyankar v += bs2; 7588499736aSShri Abhyankar } 7598499736aSShri Abhyankar } 7603649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 7618499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 7628499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 7638499736aSShri Abhyankar PetscFunctionReturn(0); 7648499736aSShri Abhyankar } 7658499736aSShri Abhyankar 7668499736aSShri Abhyankar #undef __FUNCT__ 76706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 76806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 769f1af5d2fSBarry Smith { 770f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 771dfbe8321SBarry Smith PetscErrorCode ierr; 772b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 773b3260449SShri Abhyankar PetscInt i,nz,idx,idt,oidx; 774b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 775b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 776b3260449SShri Abhyankar const PetscScalar *b; 777f1af5d2fSBarry Smith 778f1af5d2fSBarry Smith PetscFunctionBegin; 779ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7803649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 7811ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 782f1af5d2fSBarry Smith 783f1af5d2fSBarry Smith /* forward solve the U^T */ 784f1af5d2fSBarry Smith idx = 0; 785f1af5d2fSBarry Smith for (i=0; i<n; i++) { 786f1af5d2fSBarry Smith 787f1af5d2fSBarry Smith v = aa + 49*diag[i]; 788f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 789ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 790ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 
791f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 792f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 793f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 794f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 795f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 796f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 797f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 798f1af5d2fSBarry Smith v += 49; 799f1af5d2fSBarry Smith 800f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 801f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 802f1af5d2fSBarry Smith while (nz--) { 803f1af5d2fSBarry Smith oidx = 7*(*vi++); 804f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 805f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 806f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 807f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 808f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 809f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 810f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 811f1af5d2fSBarry Smith v += 49; 812f1af5d2fSBarry Smith } 813f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 814f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 815f1af5d2fSBarry Smith idx += 7; 816f1af5d2fSBarry 
Smith } 817f1af5d2fSBarry Smith /* backward solve the L^T */ 818f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 819f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 820f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 821f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 822f1af5d2fSBarry Smith idt = 7*i; 823f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 824f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 825f1af5d2fSBarry Smith while (nz--) { 826f1af5d2fSBarry Smith idx = 7*(*vi--); 827f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 828f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 829f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 830f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 831f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 832f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 833f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 834f1af5d2fSBarry Smith v -= 49; 835f1af5d2fSBarry Smith } 836f1af5d2fSBarry Smith } 8373649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 8381ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 839dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 840f1af5d2fSBarry Smith PetscFunctionReturn(0); 841f1af5d2fSBarry Smith } 8428499736aSShri Abhyankar #undef __FUNCT__ 8434dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 8444dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 8458499736aSShri Abhyankar { 8468499736aSShri Abhyankar Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ *)A->data; 8478499736aSShri Abhyankar PetscErrorCode ierr; 848b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 8498499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 850b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 851b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 852b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 853b3260449SShri Abhyankar const PetscScalar *b; 8548499736aSShri Abhyankar 8558499736aSShri Abhyankar PetscFunctionBegin; 8568499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 8573649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 8588499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 8598499736aSShri Abhyankar 8608499736aSShri Abhyankar /* forward solve the U^T */ 8618499736aSShri Abhyankar idx = 0; 8628499736aSShri Abhyankar for (i=0; i<n; i++) { 8638499736aSShri Abhyankar v = aa + bs2*diag[i]; 8648499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 8658499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 8668499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 8678499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 8688499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 8698499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 8708499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 8718499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 8728499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 8738499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 8748499736aSShri Abhyankar v -= bs2; 
8758499736aSShri Abhyankar vi = aj + diag[i] - 1; 8768499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 8778499736aSShri Abhyankar for(j=0;j>-nz;j--){ 8788499736aSShri Abhyankar oidx = bs*vi[j]; 8798499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 8808499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 8818499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 8828499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 8838499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 8848499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 8858499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 8868499736aSShri Abhyankar v -= bs2; 8878499736aSShri Abhyankar } 8888499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 8898499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 8908499736aSShri Abhyankar idx += bs; 8918499736aSShri Abhyankar } 8928499736aSShri Abhyankar /* backward solve the L^T */ 8938499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 8948499736aSShri Abhyankar v = aa + bs2*ai[i]; 8958499736aSShri Abhyankar vi = aj + ai[i]; 8968499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 8978499736aSShri Abhyankar idt = bs*i; 8988499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 8998499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 9008499736aSShri Abhyankar for(j=0;j<nz;j++){ 9018499736aSShri Abhyankar idx = bs*vi[j]; 9028499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 9038499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + 
v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 9048499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 9058499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 9068499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 9078499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 9088499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 9098499736aSShri Abhyankar v += bs2; 9108499736aSShri Abhyankar } 9118499736aSShri Abhyankar } 9123649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 9138499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 9148499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 9158499736aSShri Abhyankar PetscFunctionReturn(0); 9168499736aSShri Abhyankar } 917f1af5d2fSBarry Smith 918f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 9194a2ae208SSatish Balay #undef __FUNCT__ 92093fd935bSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 92193fd935bSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 92293fd935bSShri Abhyankar { 92393fd935bSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 92493fd935bSShri Abhyankar IS iscol = a->col,isrow = a->row; 92593fd935bSShri Abhyankar PetscErrorCode ierr; 92693fd935bSShri Abhyankar const PetscInt *rout,*cout,*r,*c,*adiag = a->diag,*ai = a->i,*aj = a->j,*vi; 92793fd935bSShri Abhyankar PetscInt i,n = a->mbs,j; 92893fd935bSShri Abhyankar PetscInt nz; 92993fd935bSShri Abhyankar PetscScalar *x,*tmp,s1; 93093fd935bSShri Abhyankar const MatScalar *aa = a->a,*v; 93193fd935bSShri Abhyankar const PetscScalar *b; 93293fd935bSShri Abhyankar 93393fd935bSShri 
Abhyankar PetscFunctionBegin; 9343649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 93593fd935bSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 93693fd935bSShri Abhyankar tmp = a->solve_work; 93793fd935bSShri Abhyankar 93893fd935bSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 93993fd935bSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 94093fd935bSShri Abhyankar 94193fd935bSShri Abhyankar /* copy the b into temp work space according to permutation */ 94293fd935bSShri Abhyankar for (i=0; i<n; i++) tmp[i] = b[c[i]]; 94393fd935bSShri Abhyankar 94493fd935bSShri Abhyankar /* forward solve the U^T */ 94593fd935bSShri Abhyankar for (i=0; i<n; i++) { 94693fd935bSShri Abhyankar v = aa + adiag[i+1] + 1; 94793fd935bSShri Abhyankar vi = aj + adiag[i+1] + 1; 94893fd935bSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 94993fd935bSShri Abhyankar s1 = tmp[i]; 95093fd935bSShri Abhyankar s1 *= v[nz]; /* multiply by inverse of diagonal entry */ 95193fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 95293fd935bSShri Abhyankar tmp[i] = s1; 95393fd935bSShri Abhyankar } 95493fd935bSShri Abhyankar 95593fd935bSShri Abhyankar /* backward solve the L^T */ 95693fd935bSShri Abhyankar for (i=n-1; i>=0; i--){ 95793fd935bSShri Abhyankar v = aa + ai[i]; 95893fd935bSShri Abhyankar vi = aj + ai[i]; 95993fd935bSShri Abhyankar nz = ai[i+1] - ai[i]; 96093fd935bSShri Abhyankar s1 = tmp[i]; 96193fd935bSShri Abhyankar for (j=0; j<nz; j++) tmp[vi[j]] -= s1*v[j]; 96293fd935bSShri Abhyankar } 96393fd935bSShri Abhyankar 96493fd935bSShri Abhyankar /* copy tmp into x according to permutation */ 96593fd935bSShri Abhyankar for (i=0; i<n; i++) x[r[i]] = tmp[i]; 96693fd935bSShri Abhyankar 96793fd935bSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 96893fd935bSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9693649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 
97093fd935bSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 97193fd935bSShri Abhyankar 97293fd935bSShri Abhyankar ierr = PetscLogFlops(2.0*a->nz-A->cmap->n);CHKERRQ(ierr); 97393fd935bSShri Abhyankar PetscFunctionReturn(0); 97493fd935bSShri Abhyankar } 97593fd935bSShri Abhyankar 97693fd935bSShri Abhyankar #undef __FUNCT__ 97706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 97806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 979f1af5d2fSBarry Smith { 980f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 981f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9826849ba73SBarry Smith PetscErrorCode ierr; 9835d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 984b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 985b3260449SShri Abhyankar PetscInt i,nz; 986b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 987b3260449SShri Abhyankar PetscScalar s1,*x,*t; 988b3260449SShri Abhyankar const PetscScalar *b; 989f1af5d2fSBarry Smith 990f1af5d2fSBarry Smith PetscFunctionBegin; 9913649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 9921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993f1af5d2fSBarry Smith t = a->solve_work; 994f1af5d2fSBarry Smith 995f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 996f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 997f1af5d2fSBarry Smith 998f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 999f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1000f1af5d2fSBarry Smith t[i] = b[c[i]]; 1001f1af5d2fSBarry Smith } 1002f1af5d2fSBarry Smith 1003f1af5d2fSBarry Smith /* forward solve the U^T */ 1004f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1005f1af5d2fSBarry Smith 1006f1af5d2fSBarry Smith v = aa + diag[i]; 1007f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1008f1af5d2fSBarry Smith s1 = 
(*v++)*t[i]; 1009f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1010f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1011f1af5d2fSBarry Smith while (nz--) { 1012f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 1013f1af5d2fSBarry Smith } 1014f1af5d2fSBarry Smith t[i] = s1; 1015f1af5d2fSBarry Smith } 1016f1af5d2fSBarry Smith /* backward solve the L^T */ 1017f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1018f1af5d2fSBarry Smith v = aa + diag[i] - 1; 1019f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1020f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1021f1af5d2fSBarry Smith s1 = t[i]; 1022f1af5d2fSBarry Smith while (nz--) { 1023f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 1024f1af5d2fSBarry Smith } 1025f1af5d2fSBarry Smith } 1026f1af5d2fSBarry Smith 1027f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1028f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1029f1af5d2fSBarry Smith x[r[i]] = t[i]; 1030f1af5d2fSBarry Smith } 1031f1af5d2fSBarry Smith 1032f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1033f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10343649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 10351ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1036dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 1037f1af5d2fSBarry Smith PetscFunctionReturn(0); 1038f1af5d2fSBarry Smith } 1039f1af5d2fSBarry Smith 10404a2ae208SSatish Balay #undef __FUNCT__ 104106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 104206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 1043f1af5d2fSBarry Smith { 1044f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1045f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10466849ba73SBarry Smith PetscErrorCode ierr; 10475d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1048b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 
1049b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1050b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1051b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1052b3260449SShri Abhyankar const PetscScalar *b; 1053f1af5d2fSBarry Smith 1054f1af5d2fSBarry Smith PetscFunctionBegin; 10553649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 10561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1057f1af5d2fSBarry Smith t = a->solve_work; 1058f1af5d2fSBarry Smith 1059f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1060f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1061f1af5d2fSBarry Smith 1062f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1063f1af5d2fSBarry Smith ii = 0; 1064f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1065f1af5d2fSBarry Smith ic = 2*c[i]; 1066f1af5d2fSBarry Smith t[ii] = b[ic]; 1067f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1068f1af5d2fSBarry Smith ii += 2; 1069f1af5d2fSBarry Smith } 1070f1af5d2fSBarry Smith 1071f1af5d2fSBarry Smith /* forward solve the U^T */ 1072f1af5d2fSBarry Smith idx = 0; 1073f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1074f1af5d2fSBarry Smith 1075f1af5d2fSBarry Smith v = aa + 4*diag[i]; 1076f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1077f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1078f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 1079f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 1080f1af5d2fSBarry Smith v += 4; 1081f1af5d2fSBarry Smith 1082f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1083f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1084f1af5d2fSBarry Smith while (nz--) { 1085f1af5d2fSBarry Smith oidx = 2*(*vi++); 1086f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 1087f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 1088f1af5d2fSBarry Smith v += 4; 1089f1af5d2fSBarry Smith } 1090f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1091f1af5d2fSBarry Smith idx 
+= 2; 1092f1af5d2fSBarry Smith } 1093f1af5d2fSBarry Smith /* backward solve the L^T */ 1094f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1095f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 1096f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1097f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1098f1af5d2fSBarry Smith idt = 2*i; 1099f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1100f1af5d2fSBarry Smith while (nz--) { 1101f1af5d2fSBarry Smith idx = 2*(*vi--); 1102f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 1103f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 1104f1af5d2fSBarry Smith v -= 4; 1105f1af5d2fSBarry Smith } 1106f1af5d2fSBarry Smith } 1107f1af5d2fSBarry Smith 1108f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1109f1af5d2fSBarry Smith ii = 0; 1110f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1111f1af5d2fSBarry Smith ir = 2*r[i]; 1112f1af5d2fSBarry Smith x[ir] = t[ii]; 1113f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1114f1af5d2fSBarry Smith ii += 2; 1115f1af5d2fSBarry Smith } 1116f1af5d2fSBarry Smith 1117f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1118f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11193649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 11201ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1121dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1122f1af5d2fSBarry Smith PetscFunctionReturn(0); 1123f1af5d2fSBarry Smith } 1124f1af5d2fSBarry Smith 11254a2ae208SSatish Balay #undef __FUNCT__ 11264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 11274dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 112832121132SShri Abhyankar { 112932121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 113032121132SShri Abhyankar PetscErrorCode ierr; 113132121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1132b3260449SShri Abhyankar const PetscInt 
n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 113332121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 113432121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1135b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1136b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1137b3260449SShri Abhyankar PetscScalar s1,s2,x1,x2,*x,*t; 1138b3260449SShri Abhyankar const PetscScalar *b; 113932121132SShri Abhyankar 114032121132SShri Abhyankar PetscFunctionBegin; 11413649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 114232121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 114332121132SShri Abhyankar t = a->solve_work; 114432121132SShri Abhyankar 114532121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 114632121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 114732121132SShri Abhyankar 114832121132SShri Abhyankar /* copy b into temp work space according to permutation */ 114932121132SShri Abhyankar for(i=0;i<n;i++){ 115032121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 115132121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; 115232121132SShri Abhyankar } 115332121132SShri Abhyankar 115432121132SShri Abhyankar /* forward solve the U^T */ 115532121132SShri Abhyankar idx = 0; 115632121132SShri Abhyankar for (i=0; i<n; i++) { 115732121132SShri Abhyankar v = aa + bs2*diag[i]; 115832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 115932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 116032121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 116132121132SShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 116232121132SShri Abhyankar v -= bs2; 116332121132SShri Abhyankar 116432121132SShri Abhyankar vi = aj + diag[i] - 1; 116532121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 116632121132SShri Abhyankar for(j=0;j>-nz;j--){ 116732121132SShri Abhyankar oidx = bs*vi[j]; 116832121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2; 116932121132SShri Abhyankar 
t[oidx+1] -= v[2]*s1 + v[3]*s2; 117032121132SShri Abhyankar v -= bs2; 117132121132SShri Abhyankar } 117232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 117332121132SShri Abhyankar idx += bs; 117432121132SShri Abhyankar } 117532121132SShri Abhyankar /* backward solve the L^T */ 117632121132SShri Abhyankar for (i=n-1; i>=0; i--){ 117732121132SShri Abhyankar v = aa + bs2*ai[i]; 117832121132SShri Abhyankar vi = aj + ai[i]; 117932121132SShri Abhyankar nz = ai[i+1] - ai[i]; 118032121132SShri Abhyankar idt = bs*i; 118132121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 118232121132SShri Abhyankar for(j=0;j<nz;j++){ 118332121132SShri Abhyankar idx = bs*vi[j]; 118432121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2; 118532121132SShri Abhyankar t[idx+1] -= v[2]*s1 + v[3]*s2; 118632121132SShri Abhyankar v += bs2; 118732121132SShri Abhyankar } 118832121132SShri Abhyankar } 118932121132SShri Abhyankar 119032121132SShri Abhyankar /* copy t into x according to permutation */ 119132121132SShri Abhyankar for(i=0;i<n;i++){ 119232121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 119332121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; 119432121132SShri Abhyankar } 119532121132SShri Abhyankar 119632121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 119732121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11983649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 119932121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 120032121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 120132121132SShri Abhyankar PetscFunctionReturn(0); 120232121132SShri Abhyankar } 120332121132SShri Abhyankar 120432121132SShri Abhyankar #undef __FUNCT__ 120506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 120606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1207f1af5d2fSBarry Smith { 1208f1af5d2fSBarry Smith Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ *)A->data; 1209f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 12106849ba73SBarry Smith PetscErrorCode ierr; 12115d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1212b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1213b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1214b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1215b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1216b3260449SShri Abhyankar const PetscScalar *b; 1217f1af5d2fSBarry Smith 1218f1af5d2fSBarry Smith PetscFunctionBegin; 12193649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 12201ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1221f1af5d2fSBarry Smith t = a->solve_work; 1222f1af5d2fSBarry Smith 1223f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1224f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1225f1af5d2fSBarry Smith 1226f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1227f1af5d2fSBarry Smith ii = 0; 1228f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1229f1af5d2fSBarry Smith ic = 3*c[i]; 1230f1af5d2fSBarry Smith t[ii] = b[ic]; 1231f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1232f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1233f1af5d2fSBarry Smith ii += 3; 1234f1af5d2fSBarry Smith } 1235f1af5d2fSBarry Smith 1236f1af5d2fSBarry Smith /* forward solve the U^T */ 1237f1af5d2fSBarry Smith idx = 0; 1238f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1239f1af5d2fSBarry Smith 1240f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1241f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1242f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1243f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1244f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1245f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1246f1af5d2fSBarry Smith v += 9; 1247f1af5d2fSBarry Smith 
1248f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1249f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1250f1af5d2fSBarry Smith while (nz--) { 1251f1af5d2fSBarry Smith oidx = 3*(*vi++); 1252f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1253f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1254f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1255f1af5d2fSBarry Smith v += 9; 1256f1af5d2fSBarry Smith } 1257f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1258f1af5d2fSBarry Smith idx += 3; 1259f1af5d2fSBarry Smith } 1260f1af5d2fSBarry Smith /* backward solve the L^T */ 1261f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1262f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1263f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1264f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1265f1af5d2fSBarry Smith idt = 3*i; 1266f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1267f1af5d2fSBarry Smith while (nz--) { 1268f1af5d2fSBarry Smith idx = 3*(*vi--); 1269f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1270f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1271f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1272f1af5d2fSBarry Smith v -= 9; 1273f1af5d2fSBarry Smith } 1274f1af5d2fSBarry Smith } 1275f1af5d2fSBarry Smith 1276f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1277f1af5d2fSBarry Smith ii = 0; 1278f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1279f1af5d2fSBarry Smith ir = 3*r[i]; 1280f1af5d2fSBarry Smith x[ir] = t[ii]; 1281f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1282f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1283f1af5d2fSBarry Smith ii += 3; 1284f1af5d2fSBarry Smith } 1285f1af5d2fSBarry Smith 1286f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1287f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12883649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 12891ebc52fbSHong Zhang ierr = 
VecRestoreArray(xx,&x);CHKERRQ(ierr); 1290dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1291f1af5d2fSBarry Smith PetscFunctionReturn(0); 1292f1af5d2fSBarry Smith } 1293f1af5d2fSBarry Smith 12944a2ae208SSatish Balay #undef __FUNCT__ 12954dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 12964dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 129732121132SShri Abhyankar { 129832121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 129932121132SShri Abhyankar PetscErrorCode ierr; 130032121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1301b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 130232121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 130332121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1304b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1305b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1306b3260449SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1307b3260449SShri Abhyankar const PetscScalar *b; 130832121132SShri Abhyankar 130932121132SShri Abhyankar PetscFunctionBegin; 13103649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 131132121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 131232121132SShri Abhyankar t = a->solve_work; 131332121132SShri Abhyankar 131432121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 131532121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 131632121132SShri Abhyankar 131732121132SShri Abhyankar /* copy b into temp work space according to permutation */ 131832121132SShri Abhyankar for(i=0;i<n;i++){ 131932121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 132032121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 132132121132SShri Abhyankar } 132232121132SShri Abhyankar 132332121132SShri Abhyankar /* forward solve the U^T */ 132432121132SShri 
Abhyankar idx = 0; 132532121132SShri Abhyankar for (i=0; i<n; i++) { 132632121132SShri Abhyankar v = aa + bs2*diag[i]; 132732121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 132832121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 132932121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 133032121132SShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 133132121132SShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 133232121132SShri Abhyankar v -= bs2; 133332121132SShri Abhyankar 133432121132SShri Abhyankar vi = aj + diag[i] - 1; 133532121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 133632121132SShri Abhyankar for(j=0;j>-nz;j--){ 133732121132SShri Abhyankar oidx = bs*vi[j]; 133832121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 133932121132SShri Abhyankar t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 134032121132SShri Abhyankar t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 134132121132SShri Abhyankar v -= bs2; 134232121132SShri Abhyankar } 134332121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 134432121132SShri Abhyankar idx += bs; 134532121132SShri Abhyankar } 134632121132SShri Abhyankar /* backward solve the L^T */ 134732121132SShri Abhyankar for (i=n-1; i>=0; i--){ 134832121132SShri Abhyankar v = aa + bs2*ai[i]; 134932121132SShri Abhyankar vi = aj + ai[i]; 135032121132SShri Abhyankar nz = ai[i+1] - ai[i]; 135132121132SShri Abhyankar idt = bs*i; 135232121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 135332121132SShri Abhyankar for(j=0;j<nz;j++){ 135432121132SShri Abhyankar idx = bs*vi[j]; 135532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 135632121132SShri Abhyankar t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 135732121132SShri Abhyankar t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 135832121132SShri Abhyankar v += bs2; 135932121132SShri Abhyankar } 136032121132SShri Abhyankar } 136132121132SShri Abhyankar 136232121132SShri Abhyankar /* copy t into x according to permutation */ 
136332121132SShri Abhyankar for(i=0;i<n;i++){ 136432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 136532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 136632121132SShri Abhyankar } 136732121132SShri Abhyankar 136832121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 136932121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 13703649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 137132121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 137232121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 137332121132SShri Abhyankar PetscFunctionReturn(0); 137432121132SShri Abhyankar } 137532121132SShri Abhyankar 137632121132SShri Abhyankar #undef __FUNCT__ 137706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 137806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1379f1af5d2fSBarry Smith { 1380f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1381f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 13826849ba73SBarry Smith PetscErrorCode ierr; 13835d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1384b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1385b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1386b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1387b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1388b3260449SShri Abhyankar const PetscScalar *b; 1389f1af5d2fSBarry Smith 1390f1af5d2fSBarry Smith PetscFunctionBegin; 13913649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 13921ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1393f1af5d2fSBarry Smith t = a->solve_work; 1394f1af5d2fSBarry Smith 1395f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1396f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 
1397f1af5d2fSBarry Smith 1398f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1399f1af5d2fSBarry Smith ii = 0; 1400f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1401f1af5d2fSBarry Smith ic = 4*c[i]; 1402f1af5d2fSBarry Smith t[ii] = b[ic]; 1403f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1404f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1405f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1406f1af5d2fSBarry Smith ii += 4; 1407f1af5d2fSBarry Smith } 1408f1af5d2fSBarry Smith 1409f1af5d2fSBarry Smith /* forward solve the U^T */ 1410f1af5d2fSBarry Smith idx = 0; 1411f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1412f1af5d2fSBarry Smith 1413f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1414f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1415f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1416f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1417f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1418f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1419f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1420f1af5d2fSBarry Smith v += 16; 1421f1af5d2fSBarry Smith 1422f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1423f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1424f1af5d2fSBarry Smith while (nz--) { 1425f1af5d2fSBarry Smith oidx = 4*(*vi++); 1426f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1427f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1428f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1429f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1430f1af5d2fSBarry Smith v += 16; 1431f1af5d2fSBarry Smith } 1432f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1433f1af5d2fSBarry Smith idx += 4; 1434f1af5d2fSBarry Smith } 1435f1af5d2fSBarry Smith /* backward solve the L^T */ 1436f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 
1437f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1438f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1439f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1440f1af5d2fSBarry Smith idt = 4*i; 1441f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1442f1af5d2fSBarry Smith while (nz--) { 1443f1af5d2fSBarry Smith idx = 4*(*vi--); 1444f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1445f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1446f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1447f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1448f1af5d2fSBarry Smith v -= 16; 1449f1af5d2fSBarry Smith } 1450f1af5d2fSBarry Smith } 1451f1af5d2fSBarry Smith 1452f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1453f1af5d2fSBarry Smith ii = 0; 1454f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1455f1af5d2fSBarry Smith ir = 4*r[i]; 1456f1af5d2fSBarry Smith x[ir] = t[ii]; 1457f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1458f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1459f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1460f1af5d2fSBarry Smith ii += 4; 1461f1af5d2fSBarry Smith } 1462f1af5d2fSBarry Smith 1463f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1464f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14653649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 14661ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1467dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1468f1af5d2fSBarry Smith PetscFunctionReturn(0); 1469f1af5d2fSBarry Smith } 1470f1af5d2fSBarry Smith 14714a2ae208SSatish Balay #undef __FUNCT__ 14724dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 14734dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 147432121132SShri Abhyankar { 147532121132SShri Abhyankar Mat_SeqBAIJ 
*a=(Mat_SeqBAIJ *)A->data; 147632121132SShri Abhyankar PetscErrorCode ierr; 147732121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1478b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 147932121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 148032121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1481b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1482b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1483b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1484b3260449SShri Abhyankar const PetscScalar *b; 148532121132SShri Abhyankar 148632121132SShri Abhyankar PetscFunctionBegin; 14873649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 148832121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 148932121132SShri Abhyankar t = a->solve_work; 149032121132SShri Abhyankar 149132121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 149232121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 149332121132SShri Abhyankar 149432121132SShri Abhyankar /* copy b into temp work space according to permutation */ 149532121132SShri Abhyankar for(i=0;i<n;i++){ 149632121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 149732121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 149832121132SShri Abhyankar } 149932121132SShri Abhyankar 150032121132SShri Abhyankar /* forward solve the U^T */ 150132121132SShri Abhyankar idx = 0; 150232121132SShri Abhyankar for (i=0; i<n; i++) { 150332121132SShri Abhyankar v = aa + bs2*diag[i]; 150432121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 150532121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 150632121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 150732121132SShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 150832121132SShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 
+ v[11]*x4; 150932121132SShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 151032121132SShri Abhyankar v -= bs2; 151132121132SShri Abhyankar 151232121132SShri Abhyankar vi = aj + diag[i] - 1; 151332121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 151432121132SShri Abhyankar for(j=0;j>-nz;j--){ 151532121132SShri Abhyankar oidx = bs*vi[j]; 151632121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 151732121132SShri Abhyankar t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 151832121132SShri Abhyankar t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 151932121132SShri Abhyankar t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 152032121132SShri Abhyankar v -= bs2; 152132121132SShri Abhyankar } 152232121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 152332121132SShri Abhyankar idx += bs; 152432121132SShri Abhyankar } 152532121132SShri Abhyankar /* backward solve the L^T */ 152632121132SShri Abhyankar for (i=n-1; i>=0; i--){ 152732121132SShri Abhyankar v = aa + bs2*ai[i]; 152832121132SShri Abhyankar vi = aj + ai[i]; 152932121132SShri Abhyankar nz = ai[i+1] - ai[i]; 153032121132SShri Abhyankar idt = bs*i; 153132121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 153232121132SShri Abhyankar for(j=0;j<nz;j++){ 153332121132SShri Abhyankar idx = bs*vi[j]; 153432121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 153532121132SShri Abhyankar t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 153632121132SShri Abhyankar t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 153732121132SShri Abhyankar t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 153832121132SShri Abhyankar v += bs2; 153932121132SShri Abhyankar } 154032121132SShri Abhyankar } 154132121132SShri Abhyankar 154232121132SShri Abhyankar /* copy t into x according to permutation */ 154332121132SShri Abhyankar for(i=0;i<n;i++){ 154432121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 
154532121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 154632121132SShri Abhyankar } 154732121132SShri Abhyankar 154832121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 154932121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15503649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 155132121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 155232121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 155332121132SShri Abhyankar PetscFunctionReturn(0); 155432121132SShri Abhyankar } 155532121132SShri Abhyankar 155632121132SShri Abhyankar #undef __FUNCT__ 155706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 155806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1559f1af5d2fSBarry Smith { 1560f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1561f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 15626849ba73SBarry Smith PetscErrorCode ierr; 15635d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1564b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1565b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1566b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1567b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1568b3260449SShri Abhyankar const PetscScalar *b; 1569f1af5d2fSBarry Smith 1570f1af5d2fSBarry Smith PetscFunctionBegin; 15713649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 15721ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1573f1af5d2fSBarry Smith t = a->solve_work; 1574f1af5d2fSBarry Smith 1575f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1576f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1577f1af5d2fSBarry Smith 1578f1af5d2fSBarry Smith /* copy the b into temp 
work space according to permutation */ 1579f1af5d2fSBarry Smith ii = 0; 1580f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1581f1af5d2fSBarry Smith ic = 5*c[i]; 1582f1af5d2fSBarry Smith t[ii] = b[ic]; 1583f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1584f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1585f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1586f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1587f1af5d2fSBarry Smith ii += 5; 1588f1af5d2fSBarry Smith } 1589f1af5d2fSBarry Smith 1590f1af5d2fSBarry Smith /* forward solve the U^T */ 1591f1af5d2fSBarry Smith idx = 0; 1592f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1593f1af5d2fSBarry Smith 1594f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1595f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1596f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1597f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1598f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1599f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1600f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1601f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1602f1af5d2fSBarry Smith v += 25; 1603f1af5d2fSBarry Smith 1604f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1605f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1606f1af5d2fSBarry Smith while (nz--) { 1607f1af5d2fSBarry Smith oidx = 5*(*vi++); 1608f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1609f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1610f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1611f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1612f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1613f1af5d2fSBarry Smith v += 25; 1614f1af5d2fSBarry Smith 
} 1615f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1616f1af5d2fSBarry Smith idx += 5; 1617f1af5d2fSBarry Smith } 1618f1af5d2fSBarry Smith /* backward solve the L^T */ 1619f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1620f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 1621f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1622f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1623f1af5d2fSBarry Smith idt = 5*i; 1624f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1625f1af5d2fSBarry Smith while (nz--) { 1626f1af5d2fSBarry Smith idx = 5*(*vi--); 1627f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1628f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1629f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1630f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1631f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1632f1af5d2fSBarry Smith v -= 25; 1633f1af5d2fSBarry Smith } 1634f1af5d2fSBarry Smith } 1635f1af5d2fSBarry Smith 1636f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1637f1af5d2fSBarry Smith ii = 0; 1638f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1639f1af5d2fSBarry Smith ir = 5*r[i]; 1640f1af5d2fSBarry Smith x[ir] = t[ii]; 1641f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1642f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1643f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1644f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1645f1af5d2fSBarry Smith ii += 5; 1646f1af5d2fSBarry Smith } 1647f1af5d2fSBarry Smith 1648f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1649f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16503649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 16511ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1652dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1653f1af5d2fSBarry Smith PetscFunctionReturn(0); 1654f1af5d2fSBarry Smith } 1655f1af5d2fSBarry Smith 16564a2ae208SSatish Balay #undef __FUNCT__ 16574dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 16584dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 165932121132SShri Abhyankar { 166032121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 166132121132SShri Abhyankar PetscErrorCode ierr; 166232121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1663b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 166432121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 166532121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1666b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1667b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1668b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1669b3260449SShri Abhyankar const PetscScalar *b; 167032121132SShri Abhyankar 167132121132SShri Abhyankar PetscFunctionBegin; 16723649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 167332121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 167432121132SShri Abhyankar t = a->solve_work; 167532121132SShri Abhyankar 167632121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 167732121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 167832121132SShri Abhyankar 167932121132SShri Abhyankar /* copy b into temp work space according to permutation */ 168032121132SShri Abhyankar for(i=0;i<n;i++){ 168132121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 168232121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 168332121132SShri Abhyankar t[ii+4] = b[ic+4]; 168432121132SShri Abhyankar } 168532121132SShri Abhyankar 168632121132SShri Abhyankar /* forward solve the U^T */ 
168732121132SShri Abhyankar idx = 0; 168832121132SShri Abhyankar for (i=0; i<n; i++) { 168932121132SShri Abhyankar v = aa + bs2*diag[i]; 169032121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 169132121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 169232121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 169332121132SShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 169432121132SShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 169532121132SShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 169632121132SShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 169732121132SShri Abhyankar v -= bs2; 169832121132SShri Abhyankar 169932121132SShri Abhyankar vi = aj + diag[i] - 1; 170032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 170132121132SShri Abhyankar for(j=0;j>-nz;j--){ 170232121132SShri Abhyankar oidx = bs*vi[j]; 170332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 170432121132SShri Abhyankar t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 170532121132SShri Abhyankar t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 170632121132SShri Abhyankar t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 170732121132SShri Abhyankar t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 170832121132SShri Abhyankar v -= bs2; 170932121132SShri Abhyankar } 171032121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 171132121132SShri Abhyankar idx += bs; 171232121132SShri Abhyankar } 171332121132SShri Abhyankar /* backward solve the L^T */ 171432121132SShri Abhyankar for (i=n-1; i>=0; i--){ 171532121132SShri Abhyankar v = aa + bs2*ai[i]; 171632121132SShri Abhyankar vi = aj + ai[i]; 171732121132SShri Abhyankar nz = ai[i+1] - ai[i]; 171832121132SShri Abhyankar idt = 
bs*i; 171932121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 172032121132SShri Abhyankar for(j=0;j<nz;j++){ 172132121132SShri Abhyankar idx = bs*vi[j]; 172232121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 172332121132SShri Abhyankar t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 172432121132SShri Abhyankar t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 172532121132SShri Abhyankar t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 172632121132SShri Abhyankar t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 172732121132SShri Abhyankar v += bs2; 172832121132SShri Abhyankar } 172932121132SShri Abhyankar } 173032121132SShri Abhyankar 173132121132SShri Abhyankar /* copy t into x according to permutation */ 173232121132SShri Abhyankar for(i=0;i<n;i++){ 173332121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 173432121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 173532121132SShri Abhyankar x[ir+4] = t[ii+4]; 173632121132SShri Abhyankar } 173732121132SShri Abhyankar 173832121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 173932121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 17403649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 174132121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 174232121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 174332121132SShri Abhyankar PetscFunctionReturn(0); 174432121132SShri Abhyankar } 174532121132SShri Abhyankar 174632121132SShri Abhyankar #undef __FUNCT__ 174706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 174806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1749f1af5d2fSBarry Smith { 1750f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
1751f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 17526849ba73SBarry Smith PetscErrorCode ierr; 17535d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1754b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1755b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1756b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1757b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1758b3260449SShri Abhyankar const PetscScalar *b; 1759f1af5d2fSBarry Smith 1760f1af5d2fSBarry Smith PetscFunctionBegin; 17613649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 17621ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763f1af5d2fSBarry Smith t = a->solve_work; 1764f1af5d2fSBarry Smith 1765f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767f1af5d2fSBarry Smith 1768f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1769f1af5d2fSBarry Smith ii = 0; 1770f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1771f1af5d2fSBarry Smith ic = 6*c[i]; 1772f1af5d2fSBarry Smith t[ii] = b[ic]; 1773f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1774f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1775f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1776f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1777f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1778f1af5d2fSBarry Smith ii += 6; 1779f1af5d2fSBarry Smith } 1780f1af5d2fSBarry Smith 1781f1af5d2fSBarry Smith /* forward solve the U^T */ 1782f1af5d2fSBarry Smith idx = 0; 1783f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1784f1af5d2fSBarry Smith 1785f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1786f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1787f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1788f1af5d2fSBarry Smith x6 = t[5+idx]; 1789f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 
+ v[3]*x4 + v[4]*x5 + v[5]*x6; 1790f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1791f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1792f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1793f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1794f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1795f1af5d2fSBarry Smith v += 36; 1796f1af5d2fSBarry Smith 1797f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1798f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1799f1af5d2fSBarry Smith while (nz--) { 1800f1af5d2fSBarry Smith oidx = 6*(*vi++); 1801f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1802f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1803f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1804f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1805f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1806f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1807f1af5d2fSBarry Smith v += 36; 1808f1af5d2fSBarry Smith } 1809f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1810f1af5d2fSBarry Smith t[5+idx] = s6; 1811f1af5d2fSBarry Smith idx += 6; 1812f1af5d2fSBarry Smith } 1813f1af5d2fSBarry Smith /* backward solve the L^T */ 1814f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1815f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1816f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1817f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1818f1af5d2fSBarry Smith idt = 6*i; 1819f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 
1820f1af5d2fSBarry Smith s6 = t[5+idt]; 1821f1af5d2fSBarry Smith while (nz--) { 1822f1af5d2fSBarry Smith idx = 6*(*vi--); 1823f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1824f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1825f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1826f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1827f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1828f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1829f1af5d2fSBarry Smith v -= 36; 1830f1af5d2fSBarry Smith } 1831f1af5d2fSBarry Smith } 1832f1af5d2fSBarry Smith 1833f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1834f1af5d2fSBarry Smith ii = 0; 1835f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1836f1af5d2fSBarry Smith ir = 6*r[i]; 1837f1af5d2fSBarry Smith x[ir] = t[ii]; 1838f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1839f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1840f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1841f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1842f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1843f1af5d2fSBarry Smith ii += 6; 1844f1af5d2fSBarry Smith } 1845f1af5d2fSBarry Smith 1846f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1847f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18483649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 18491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1850dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1851f1af5d2fSBarry Smith PetscFunctionReturn(0); 1852f1af5d2fSBarry Smith } 1853f1af5d2fSBarry Smith 18544a2ae208SSatish Balay #undef __FUNCT__ 18554dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 18564dd39f65SShri 
Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 185732121132SShri Abhyankar { 185832121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 185932121132SShri Abhyankar PetscErrorCode ierr; 186032121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 1861b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 186232121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 186332121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1864b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1865b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1866b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1867b3260449SShri Abhyankar const PetscScalar *b; 186832121132SShri Abhyankar 186932121132SShri Abhyankar PetscFunctionBegin; 18703649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 187132121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 187232121132SShri Abhyankar t = a->solve_work; 187332121132SShri Abhyankar 187432121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 187532121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 187632121132SShri Abhyankar 187732121132SShri Abhyankar /* copy b into temp work space according to permutation */ 187832121132SShri Abhyankar for(i=0;i<n;i++){ 187932121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 188032121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 188132121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 188232121132SShri Abhyankar } 188332121132SShri Abhyankar 188432121132SShri Abhyankar /* forward solve the U^T */ 188532121132SShri Abhyankar idx = 0; 188632121132SShri Abhyankar for (i=0; i<n; i++) { 188732121132SShri Abhyankar v = aa + bs2*diag[i]; 188832121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 188932121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 
x4 = t[3+idx]; x5 = t[4+idx]; 189032121132SShri Abhyankar x6 = t[5+idx]; 189132121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 189232121132SShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 189332121132SShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 189432121132SShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 189532121132SShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 189632121132SShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 189732121132SShri Abhyankar v -= bs2; 189832121132SShri Abhyankar 189932121132SShri Abhyankar vi = aj + diag[i] - 1; 190032121132SShri Abhyankar nz = diag[i] - diag[i+1] - 1; 190132121132SShri Abhyankar for(j=0;j>-nz;j--){ 190232121132SShri Abhyankar oidx = bs*vi[j]; 190332121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 190432121132SShri Abhyankar t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 190532121132SShri Abhyankar t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 190632121132SShri Abhyankar t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 190732121132SShri Abhyankar t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 190832121132SShri Abhyankar t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 190932121132SShri Abhyankar v -= bs2; 191032121132SShri Abhyankar } 191132121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 191232121132SShri Abhyankar t[5+idx] = s6; 191332121132SShri Abhyankar idx += bs; 191432121132SShri Abhyankar } 191532121132SShri Abhyankar /* backward solve the L^T */ 191632121132SShri Abhyankar for (i=n-1; i>=0; i--){ 191732121132SShri Abhyankar v = aa + bs2*ai[i]; 191832121132SShri 
Abhyankar vi = aj + ai[i]; 191932121132SShri Abhyankar nz = ai[i+1] - ai[i]; 192032121132SShri Abhyankar idt = bs*i; 192132121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 192232121132SShri Abhyankar s6 = t[5+idt]; 192332121132SShri Abhyankar for(j=0;j<nz;j++){ 192432121132SShri Abhyankar idx = bs*vi[j]; 192532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 192632121132SShri Abhyankar t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 192732121132SShri Abhyankar t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 192832121132SShri Abhyankar t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 192932121132SShri Abhyankar t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 193032121132SShri Abhyankar t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 193132121132SShri Abhyankar v += bs2; 193232121132SShri Abhyankar } 193332121132SShri Abhyankar } 193432121132SShri Abhyankar 193532121132SShri Abhyankar /* copy t into x according to permutation */ 193632121132SShri Abhyankar for(i=0;i<n;i++){ 193732121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 193832121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 193932121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 194032121132SShri Abhyankar } 194132121132SShri Abhyankar 194232121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 194332121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 19443649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 194532121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 194632121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 194732121132SShri Abhyankar PetscFunctionReturn(0); 194832121132SShri Abhyankar } 194932121132SShri 
Abhyankar 195032121132SShri Abhyankar #undef __FUNCT__ 195106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 195206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1953f1af5d2fSBarry Smith { 1954f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1955f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 19566849ba73SBarry Smith PetscErrorCode ierr; 19575d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 1958b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1959b3260449SShri Abhyankar PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1960b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 1961b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1962b3260449SShri Abhyankar const PetscScalar *b; 1963f1af5d2fSBarry Smith 1964f1af5d2fSBarry Smith PetscFunctionBegin; 19653649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 19661ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1967f1af5d2fSBarry Smith t = a->solve_work; 1968f1af5d2fSBarry Smith 1969f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1970f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1971f1af5d2fSBarry Smith 1972f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1973f1af5d2fSBarry Smith ii = 0; 1974f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1975f1af5d2fSBarry Smith ic = 7*c[i]; 1976f1af5d2fSBarry Smith t[ii] = b[ic]; 1977f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1978f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1979f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1980f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1981f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1982f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1983f1af5d2fSBarry Smith ii += 7; 1984f1af5d2fSBarry Smith } 1985f1af5d2fSBarry Smith 1986f1af5d2fSBarry Smith /* forward solve the U^T */ 1987f1af5d2fSBarry Smith idx = 0; 1988f1af5d2fSBarry 
Smith for (i=0; i<n; i++) { 1989f1af5d2fSBarry Smith 1990f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1991f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1992f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1993f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1994f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1995f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1996f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1997f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1998f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1999f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 2000f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 2001f1af5d2fSBarry Smith v += 49; 2002f1af5d2fSBarry Smith 2003f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 2004f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 2005f1af5d2fSBarry Smith while (nz--) { 2006f1af5d2fSBarry Smith oidx = 7*(*vi++); 2007f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2008f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2009f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2010f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2011f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2012f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2013f1af5d2fSBarry 
Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2014f1af5d2fSBarry Smith v += 49; 2015f1af5d2fSBarry Smith } 2016f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2017f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 2018f1af5d2fSBarry Smith idx += 7; 2019f1af5d2fSBarry Smith } 2020f1af5d2fSBarry Smith /* backward solve the L^T */ 2021f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 2022f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 2023f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 2024f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 2025f1af5d2fSBarry Smith idt = 7*i; 2026f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2027f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 2028f1af5d2fSBarry Smith while (nz--) { 2029f1af5d2fSBarry Smith idx = 7*(*vi--); 2030f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2031f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2032f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2033f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2034f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2035f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2036f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2037f1af5d2fSBarry Smith v -= 49; 2038f1af5d2fSBarry Smith } 2039f1af5d2fSBarry Smith } 2040f1af5d2fSBarry Smith 2041f1af5d2fSBarry Smith /* copy t into x according to permutation */ 2042f1af5d2fSBarry Smith ii = 0; 2043f1af5d2fSBarry Smith for (i=0; i<n; i++) { 2044f1af5d2fSBarry Smith ir = 7*r[i]; 2045f1af5d2fSBarry Smith x[ir] = 
t[ii]; 2046f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 2047f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 2048f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 2049f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 2050f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 2051f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 2052f1af5d2fSBarry Smith ii += 7; 2053f1af5d2fSBarry Smith } 2054f1af5d2fSBarry Smith 2055f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2056f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 20573649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 20581ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2059dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2060f1af5d2fSBarry Smith PetscFunctionReturn(0); 2061f1af5d2fSBarry Smith } 206232121132SShri Abhyankar #undef __FUNCT__ 20634dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 20644dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 206532121132SShri Abhyankar { 206632121132SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 206732121132SShri Abhyankar PetscErrorCode ierr; 206832121132SShri Abhyankar IS iscol=a->col,isrow=a->row; 2069b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 207032121132SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 207132121132SShri Abhyankar PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 2072b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2073b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2074b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2075b3260449SShri Abhyankar const PetscScalar *b; 207632121132SShri Abhyankar 207732121132SShri Abhyankar PetscFunctionBegin; 20783649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 207932121132SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 208032121132SShri Abhyankar t = a->solve_work; 
208132121132SShri Abhyankar 208232121132SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 208332121132SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 208432121132SShri Abhyankar 208532121132SShri Abhyankar /* copy b into temp work space according to permutation */ 208632121132SShri Abhyankar for(i=0;i<n;i++){ 208732121132SShri Abhyankar ii = bs*i; ic = bs*c[i]; 208832121132SShri Abhyankar t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 208932121132SShri Abhyankar t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 209032121132SShri Abhyankar } 209132121132SShri Abhyankar 209232121132SShri Abhyankar /* forward solve the U^T */ 209332121132SShri Abhyankar idx = 0; 209432121132SShri Abhyankar for (i=0; i<n; i++) { 209532121132SShri Abhyankar v = aa + bs2*diag[i]; 209632121132SShri Abhyankar /* multiply by the inverse of the block diagonal */ 209732121132SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 209832121132SShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 209932121132SShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 210032121132SShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 210132121132SShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 210232121132SShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 210332121132SShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 210432121132SShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 210532121132SShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 210632121132SShri Abhyankar v -= bs2; 210732121132SShri Abhyankar 210832121132SShri Abhyankar vi = aj + diag[i] - 1; 210932121132SShri Abhyankar nz = diag[i] 
- diag[i+1] - 1; 211032121132SShri Abhyankar for(j=0;j>-nz;j--){ 211132121132SShri Abhyankar oidx = bs*vi[j]; 211232121132SShri Abhyankar t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 211332121132SShri Abhyankar t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 211432121132SShri Abhyankar t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 211532121132SShri Abhyankar t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 211632121132SShri Abhyankar t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 211732121132SShri Abhyankar t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 211832121132SShri Abhyankar t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 211932121132SShri Abhyankar v -= bs2; 212032121132SShri Abhyankar } 212132121132SShri Abhyankar t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 212232121132SShri Abhyankar t[5+idx] = s6; t[6+idx] = s7; 212332121132SShri Abhyankar idx += bs; 212432121132SShri Abhyankar } 212532121132SShri Abhyankar /* backward solve the L^T */ 212632121132SShri Abhyankar for (i=n-1; i>=0; i--){ 212732121132SShri Abhyankar v = aa + bs2*ai[i]; 212832121132SShri Abhyankar vi = aj + ai[i]; 212932121132SShri Abhyankar nz = ai[i+1] - ai[i]; 213032121132SShri Abhyankar idt = bs*i; 213132121132SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 213232121132SShri Abhyankar s6 = t[5+idt]; s7 = t[6+idt]; 213332121132SShri Abhyankar for(j=0;j<nz;j++){ 213432121132SShri Abhyankar idx = bs*vi[j]; 213532121132SShri Abhyankar t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 213632121132SShri Abhyankar t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 213732121132SShri 
Abhyankar t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 213832121132SShri Abhyankar t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 213932121132SShri Abhyankar t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 214032121132SShri Abhyankar t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 214132121132SShri Abhyankar t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 214232121132SShri Abhyankar v += bs2; 214332121132SShri Abhyankar } 214432121132SShri Abhyankar } 214532121132SShri Abhyankar 214632121132SShri Abhyankar /* copy t into x according to permutation */ 214732121132SShri Abhyankar for(i=0;i<n;i++){ 214832121132SShri Abhyankar ii = bs*i; ir = bs*r[i]; 214932121132SShri Abhyankar x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 215032121132SShri Abhyankar x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 215132121132SShri Abhyankar } 215232121132SShri Abhyankar 215332121132SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 215432121132SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 21553649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 215632121132SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 215732121132SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 215832121132SShri Abhyankar PetscFunctionReturn(0); 215932121132SShri Abhyankar } 2160f1af5d2fSBarry Smith 21614e2b4712SSatish Balay /* ----------------------------------------------------------- */ 21624a2ae208SSatish Balay #undef __FUNCT__ 216306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 216406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 21654e2b4712SSatish Balay { 21664e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 
21674e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 21686849ba73SBarry Smith PetscErrorCode ierr; 2169b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2170b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2171b3260449SShri Abhyankar PetscInt i,nz; 2172b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2173b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2174b3260449SShri Abhyankar PetscScalar *x,*s,*t,*ls; 2175b3260449SShri Abhyankar const PetscScalar *b; 21764e2b4712SSatish Balay 21774e2b4712SSatish Balay PetscFunctionBegin; 21783649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 21791ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2180f1af5d2fSBarry Smith t = a->solve_work; 21814e2b4712SSatish Balay 21824e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 21834e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 21844e2b4712SSatish Balay 21854e2b4712SSatish Balay /* forward solve the lower triangular */ 218687828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21874e2b4712SSatish Balay for (i=1; i<n; i++) { 21884e2b4712SSatish Balay v = aa + bs2*ai[i]; 21894e2b4712SSatish Balay vi = aj + ai[i]; 21904e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 2191f1af5d2fSBarry Smith s = t + bs*i; 219287828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 21934e2b4712SSatish Balay while (nz--) { 2194f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 21954e2b4712SSatish Balay v += bs2; 21964e2b4712SSatish Balay } 21974e2b4712SSatish Balay } 21984e2b4712SSatish Balay /* backward solve the upper triangular */ 2199d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 22004e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 22014e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 22024e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 22034e2b4712SSatish 
Balay nz = ai[i+1] - a->diag[i] - 1; 220487828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22054e2b4712SSatish Balay while (nz--) { 2206f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 22074e2b4712SSatish Balay v += bs2; 22084e2b4712SSatish Balay } 2209f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 221087828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22114e2b4712SSatish Balay } 22124e2b4712SSatish Balay 22134e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22144e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22153649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2217dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22184e2b4712SSatish Balay PetscFunctionReturn(0); 22194e2b4712SSatish Balay } 22204e2b4712SSatish Balay 22215c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 22225c42ef9dSBarry Smith #undef __FUNCT__ 222306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 222406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 22255c42ef9dSBarry Smith { 22265c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22275c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 22285c42ef9dSBarry Smith PetscErrorCode ierr; 22295c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2230b3260449SShri Abhyankar PetscInt i,nz,j; 2231b3260449SShri Abhyankar const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 22325c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 22335c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 22345c42ef9dSBarry Smith const PetscScalar *b; 22355c42ef9dSBarry Smith PetscFunctionBegin; 22363649974fSBarry Smith ierr = 
VecGetArrayRead(bb,&b);CHKERRQ(ierr); 22375c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22385c42ef9dSBarry Smith t = a->solve_work; 22395c42ef9dSBarry Smith 22405c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22415c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22425c42ef9dSBarry Smith 22435c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 22445c42ef9dSBarry Smith for (i=0; i<n; i++) { 22455c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22465c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 22475c42ef9dSBarry Smith } 22485c42ef9dSBarry Smith } 22495c42ef9dSBarry Smith 22505c42ef9dSBarry Smith 22515c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 22525c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 22535c42ef9dSBarry Smith for (i=0; i<n; i++){ 22545c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 22555c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 22565c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 22575c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 22585c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 22595c42ef9dSBarry Smith while (nz--) { 22605c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22615c42ef9dSBarry Smith v += bs2; 22625c42ef9dSBarry Smith } 22635c42ef9dSBarry Smith } 22645c42ef9dSBarry Smith 22655c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 22665c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 22675c42ef9dSBarry Smith v = aa + bs2*ai[i]; 22685c42ef9dSBarry Smith vi = aj + ai[i]; 22695c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 22705c42ef9dSBarry Smith while (nz--) { 22715c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 22725c42ef9dSBarry Smith v += bs2; 22735c42ef9dSBarry Smith } 22745c42ef9dSBarry Smith } 22755c42ef9dSBarry Smith 22765c42ef9dSBarry Smith 
/* copy t into x according to permutation */ 22775c42ef9dSBarry Smith for (i=0; i<n; i++) { 22785c42ef9dSBarry Smith for (j=0; j<bs; j++) { 22795c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 22805c42ef9dSBarry Smith } 22815c42ef9dSBarry Smith } 22825c42ef9dSBarry Smith 22835c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22845c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22853649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 22865c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22875c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 22885c42ef9dSBarry Smith PetscFunctionReturn(0); 22895c42ef9dSBarry Smith } 22905c42ef9dSBarry Smith 22914a2ae208SSatish Balay #undef __FUNCT__ 22924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 22934dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 22948499736aSShri Abhyankar { 22958499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22968499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 22978499736aSShri Abhyankar PetscErrorCode ierr; 2298b3260449SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 2299b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2300b3260449SShri Abhyankar PetscInt i,j,nz; 2301b3260449SShri Abhyankar const PetscInt bs=A->rmap->bs,bs2=a->bs2; 23028499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 23038499736aSShri Abhyankar PetscScalar *x,*t,*ls; 23048499736aSShri Abhyankar const PetscScalar *b; 2305b3260449SShri Abhyankar 23068499736aSShri Abhyankar PetscFunctionBegin; 23073649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23088499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23098499736aSShri Abhyankar t = a->solve_work; 23108499736aSShri Abhyankar 23118499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 
23128499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 23138499736aSShri Abhyankar 23148499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 23158499736aSShri Abhyankar for (i=0; i<n; i++) { 23168499736aSShri Abhyankar for (j=0; j<bs; j++) { 23178499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 23188499736aSShri Abhyankar } 23198499736aSShri Abhyankar } 23208499736aSShri Abhyankar 23218499736aSShri Abhyankar 23228499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 23238499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 23248499736aSShri Abhyankar for (i=0; i<n; i++){ 23258499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 23268499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 23278499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 23288499736aSShri Abhyankar vi = aj + diag[i] - 1; 23298499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 23308499736aSShri Abhyankar for(j=0;j>-nz;j--){ 23318499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23328499736aSShri Abhyankar v -= bs2; 23338499736aSShri Abhyankar } 23348499736aSShri Abhyankar } 23358499736aSShri Abhyankar 23368499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 23378499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 23388499736aSShri Abhyankar v = aa + bs2*ai[i]; 23398499736aSShri Abhyankar vi = aj + ai[i]; 23408499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 23418499736aSShri Abhyankar for(j=0;j<nz;j++){ 23428499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 23438499736aSShri Abhyankar v += bs2; 23448499736aSShri Abhyankar } 23458499736aSShri Abhyankar } 23468499736aSShri Abhyankar 23478499736aSShri Abhyankar /* copy t into x according to permutation */ 23488499736aSShri Abhyankar for (i=0; i<n; i++) { 23498499736aSShri Abhyankar for (j=0; j<bs; j++) { 
23508499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 23518499736aSShri Abhyankar } 23528499736aSShri Abhyankar } 23538499736aSShri Abhyankar 23548499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 23558499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 23563649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 23578499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 23588499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 23598499736aSShri Abhyankar PetscFunctionReturn(0); 23608499736aSShri Abhyankar } 23618499736aSShri Abhyankar 2362832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns of the block at once */ 236329a97285SShri Abhyankar 23642b0b2ea7SShri Abhyankar #undef __FUNCT__ 2365832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2366832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 23672b0b2ea7SShri Abhyankar { 23682b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 23692b0b2ea7SShri Abhyankar PetscErrorCode ierr; 2370b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 23710fa040f9SShri Abhyankar PetscInt i,nz,idx,idt,m; 23720b68f018SBarry Smith const MatScalar *aa=a->a,*v; 23732b0b2ea7SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 23742b0b2ea7SShri Abhyankar PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 23750fa040f9SShri Abhyankar PetscScalar *x; 23760b68f018SBarry Smith const PetscScalar *b; 23772b0b2ea7SShri Abhyankar 23782b0b2ea7SShri Abhyankar PetscFunctionBegin; 23793649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 23802b0b2ea7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23812b0b2ea7SShri Abhyankar 23822b0b2ea7SShri Abhyankar /* forward 
solve the lower triangular */ 238329a97285SShri Abhyankar idx = 0; 23840fa040f9SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx]; x[4] = b[4+idx]; 23850fa040f9SShri Abhyankar x[5] = b[5+idx]; x[6] = b[6+idx]; x[7] = b[7+idx]; x[8] = b[8+idx]; x[9] = b[9+idx]; 23860fa040f9SShri Abhyankar x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx]; 23872b0b2ea7SShri Abhyankar 23882b0b2ea7SShri Abhyankar for (i=1; i<n; i++) { 23892b0b2ea7SShri Abhyankar v = aa + bs2*ai[i]; 23902b0b2ea7SShri Abhyankar vi = aj + ai[i]; 23912b0b2ea7SShri Abhyankar nz = ai[i+1] - ai[i]; 23920fa040f9SShri Abhyankar idt = bs*i; 23930fa040f9SShri Abhyankar s1 = b[idt]; s2 = b[1+idt]; s3 = b[2+idt]; s4 = b[3+idt]; s5 = b[4+idt]; 23940fa040f9SShri Abhyankar s6 = b[5+idt]; s7 = b[6+idt]; s8 = b[7+idt]; s9 = b[8+idt]; s10 = b[9+idt]; 23950fa040f9SShri Abhyankar s11 = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt]; 23962b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 23972b0b2ea7SShri Abhyankar idx = bs*vi[m]; 23980fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 23990fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 24000fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 24012b0b2ea7SShri Abhyankar 24020b8f6341SShri Abhyankar 24032b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24042b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24052b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 
+ v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24062b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24072b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24082b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24092b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24102b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24112b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24122b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24132b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24142b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 
24152b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24162b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24172b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24182b0b2ea7SShri Abhyankar 24192b0b2ea7SShri Abhyankar v += bs2; 24202b0b2ea7SShri Abhyankar } 24210fa040f9SShri Abhyankar x[idt] = s1; x[1+idt] = s2; x[2+idt] = s3; x[3+idt] = s4; x[4+idt] = s5; 24220fa040f9SShri Abhyankar x[5+idt] = s6; x[6+idt] = s7; x[7+idt] = s8; x[8+idt] = s9; x[9+idt] = s10; 24230fa040f9SShri Abhyankar x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15; 24242b0b2ea7SShri Abhyankar 24252b0b2ea7SShri Abhyankar } 24262b0b2ea7SShri Abhyankar /* backward solve the upper triangular */ 24272b0b2ea7SShri Abhyankar for (i=n-1; i>=0; i--){ 24282b0b2ea7SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 24292b0b2ea7SShri Abhyankar vi = aj + adiag[i+1]+1; 24302b0b2ea7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 24312b0b2ea7SShri Abhyankar idt = bs*i; 24320fa040f9SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 24330fa040f9SShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; s8 = x[7+idt]; s9 = x[8+idt]; s10 = x[9+idt]; 24340fa040f9SShri Abhyankar s11 = x[10+idt]; s12 = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt]; 24352b0b2ea7SShri Abhyankar 24362b0b2ea7SShri Abhyankar for(m=0;m<nz;m++){ 24372b0b2ea7SShri Abhyankar idx = bs*vi[m]; 24380fa040f9SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 
24390fa040f9SShri Abhyankar x6 = x[5+idx]; x7 = x[6+idx]; x8 = x[7+idx]; x9 = x[8+idx]; x10 = x[9+idx]; 24400fa040f9SShri Abhyankar x11 = x[10+idx]; x12 = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx]; 24412b0b2ea7SShri Abhyankar 24422b0b2ea7SShri Abhyankar s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 24432b0b2ea7SShri Abhyankar s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 24442b0b2ea7SShri Abhyankar s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 24452b0b2ea7SShri Abhyankar s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 24462b0b2ea7SShri Abhyankar s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 24472b0b2ea7SShri Abhyankar s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 24482b0b2ea7SShri Abhyankar s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 24492b0b2ea7SShri Abhyankar s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 24502b0b2ea7SShri Abhyankar s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + 
v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 24512b0b2ea7SShri Abhyankar s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 24522b0b2ea7SShri Abhyankar s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 24532b0b2ea7SShri Abhyankar s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 24542b0b2ea7SShri Abhyankar s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 24552b0b2ea7SShri Abhyankar s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 24562b0b2ea7SShri Abhyankar s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 24572b0b2ea7SShri Abhyankar 24582b0b2ea7SShri Abhyankar v += bs2; 24592b0b2ea7SShri Abhyankar } 24602b0b2ea7SShri Abhyankar 24610fa040f9SShri Abhyankar x[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 24620fa040f9SShri Abhyankar x[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 24630fa040f9SShri 
Abhyankar x[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 24640fa040f9SShri Abhyankar x[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 24650fa040f9SShri Abhyankar x[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 24660fa040f9SShri Abhyankar x[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 24670fa040f9SShri Abhyankar x[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 24680fa040f9SShri Abhyankar x[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 24690fa040f9SShri Abhyankar x[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 24700fa040f9SShri Abhyankar x[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 24710fa040f9SShri Abhyankar x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 24720fa040f9SShri Abhyankar x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 
+ v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 24730fa040f9SShri Abhyankar x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 24740fa040f9SShri Abhyankar x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 24750fa040f9SShri Abhyankar x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 24762b0b2ea7SShri Abhyankar 24772b0b2ea7SShri Abhyankar } 24782b0b2ea7SShri Abhyankar 24793649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 24802b0b2ea7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 24812b0b2ea7SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 24822b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 24832b0b2ea7SShri Abhyankar } 24842b0b2ea7SShri Abhyankar 2485832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. 
Block operations are done by accessing one column at a time */
/* Default MatSolve for block size 15 */

/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver1 - solve with the factored matrix A
   for block size 15, no row/column permutation.

   Input:   A  - factored matrix (its data is read as Mat_SeqBAIJ)
            bb - right-hand side
   Output:  xx - solution
   Returns: 0 on success, otherwise the error code propagated by CHKERRQ.

   Blocks are traversed one column at a time: the 15 entries v[0..14] are one
   column of the current block, scaled by a single source entry and accumulated
   into the 15 entries of the destination block; v then advances by 15 to the
   next column.
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
  PetscInt          i,k,nz,idx,idt,m;
  const MatScalar   *aa=a->a,*v;
  PetscScalar       s[15];
  PetscScalar       *x,xv;
  const PetscScalar *b;

  PetscFunctionBegin;
  ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular */
  for (i=0; i<n; i++) {
    v     = aa + bs2*ai[i];
    vi    = aj + ai[i];
    nz    = ai[i+1] - ai[i];
    idt   = bs*i;

    /* start block row i of x from the right-hand side */
    for (k=0; k<15; k++) x[k+idt] = b[k+idt];

    for (m=0; m<nz; m++) {
      idx = bs*vi[m];
      /* x_i -= block * x_{vi[m]}, one block column per iteration of k */
      for (k=0; k<15; k++) {
        xv         = x[k + idx];
        x[idt]    -= v[0]*xv;
        x[1+idt]  -= v[1]*xv;
        x[2+idt]  -= v[2]*xv;
        x[3+idt]  -= v[3]*xv;
        x[4+idt]  -= v[4]*xv;
        x[5+idt]  -= v[5]*xv;
        x[6+idt]  -= v[6]*xv;
        x[7+idt]  -= v[7]*xv;
        x[8+idt]  -= v[8]*xv;
        x[9+idt]  -= v[9]*xv;
        x[10+idt] -= v[10]*xv;
        x[11+idt] -= v[11]*xv;
        x[12+idt] -= v[12]*xv;
        x[13+idt] -= v[13]*xv;
        x[14+idt] -= v[14]*xv;
        v         += 15;
      }
    }
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    /* off-diagonal blocks of row i occupy adiag[i+1]+1 .. adiag[i]-1 */
    v    = aa + bs2*(adiag[i+1]+1);
    vi   = aj + adiag[i+1]+1;
    nz   = adiag[i] - adiag[i+1] - 1;
    idt  = bs*i;

    /* work on a private copy so x_i can be overwritten at the end */
    for (k=0; k<15; k++) s[k] = x[k+idt];

    for (m=0; m<nz; m++) {
      idx = bs*vi[m];
      for (k=0; k<15; k++) {
        xv     = x[k + idx];
        s[0]  -= v[0]*xv;
        s[1]  -= v[1]*xv;
        s[2]  -= v[2]*xv;
        s[3]  -= v[3]*xv;
        s[4]  -= v[4]*xv;
        s[5]  -= v[5]*xv;
        s[6]  -= v[6]*xv;
        s[7]  -= v[7]*xv;
        s[8]  -= v[8]*xv;
        s[9]  -= v[9]*xv;
        s[10] -= v[10]*xv;
        s[11] -= v[11]*xv;
        s[12] -= v[12]*xv;
        s[13] -= v[13]*xv;
        s[14] -= v[14]*xv;
        v     += 15;
      }
    }

    /*
       x is an array of PetscScalar, so the block must be cleared with
       sizeof(PetscScalar); sizing it by MatScalar leaves part of the block
       unzeroed whenever the two types differ, and the += below would then
       accumulate onto stale values.
    */
    ierr = PetscMemzero(x+idt,bs*sizeof(PetscScalar));CHKERRQ(ierr);

    /*
       v now addresses block adiag[i]; x_i = block * s, column by column
       (presumably the stored inverse of the diagonal block -- confirm against
       the factorization routine).
    */
    for (k=0; k<15; k++) {
      x[idt]    += v[0]*s[k];
      x[1+idt]  += v[1]*s[k];
      x[2+idt]  += v[2]*s[k];
      x[3+idt]  += v[3]*s[k];
      x[4+idt]  += v[4]*s[k];
      x[5+idt]  += v[5]*s[k];
      x[6+idt]  += v[6]*s[k];
      x[7+idt]  += v[7]*s[k];
      x[8+idt]  += v[8]*s[k];
      x[9+idt]  += v[9]*s[k];
      x[10+idt] += v[10]*s[k];
      x[11+idt] += v[11]*s[k];
      x[12+idt] += v[12]*s[k];
      x[13+idt] += v[13]*s[k];
      x[14+idt] += v[14]*s[k];
      v         += 15;
    }
  }

  ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}


#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
259806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 25994e2b4712SSatish Balay { 26004e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 26014e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 26026849ba73SBarry Smith PetscErrorCode ierr; 2603b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2604b3260449SShri Abhyankar const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2605b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2606b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2607b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2608b3260449SShri Abhyankar const PetscScalar *b; 26094e2b4712SSatish Balay 26104e2b4712SSatish Balay PetscFunctionBegin; 26113649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 26121ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2613f1af5d2fSBarry Smith t = a->solve_work; 26144e2b4712SSatish Balay 26154e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 26164e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 26174e2b4712SSatish Balay 26184e2b4712SSatish Balay /* forward solve the lower triangular */ 26194e2b4712SSatish Balay idx = 7*(*r++); 2620f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2621f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2622f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 26234e2b4712SSatish Balay 26244e2b4712SSatish Balay for (i=1; i<n; i++) { 26254e2b4712SSatish Balay v = aa + 49*ai[i]; 26264e2b4712SSatish Balay vi = aj + ai[i]; 26274e2b4712SSatish Balay nz = diag[i] - ai[i]; 26284e2b4712SSatish Balay idx = 7*(*r++); 2629f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2630f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 26314e2b4712SSatish Balay while (nz--) { 26324e2b4712SSatish Balay idx = 7*(*vi++); 2633f1af5d2fSBarry Smith x1 = t[idx]; x2 = 
t[1+idx];x3 = t[2+idx]; 2634f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2635f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 2636f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2637f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2638f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2639f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2640f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2641f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2642f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26434e2b4712SSatish Balay v += 49; 26444e2b4712SSatish Balay } 26454e2b4712SSatish Balay idx = 7*i; 2646f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2647f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2648f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 26494e2b4712SSatish Balay } 26504e2b4712SSatish Balay /* backward solve the upper triangular */ 26514e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 26524e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 26534e2b4712SSatish Balay vi = aj + diag[i] + 1; 26544e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 26554e2b4712SSatish Balay idt = 7*i; 2656f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2657f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2658f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 26594e2b4712SSatish Balay while (nz--) { 26604e2b4712SSatish Balay idx = 7*(*vi++); 2661f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2662f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2663f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 2664f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + 
v[28]*x5 + v[35]*x6 + v[42]*x7; 2665f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2666f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2667f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2668f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2669f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2670f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 26714e2b4712SSatish Balay v += 49; 26724e2b4712SSatish Balay } 26734e2b4712SSatish Balay idc = 7*(*c--); 26744e2b4712SSatish Balay v = aa + 49*diag[i]; 2675f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2676f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2677f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2678f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2679f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2680f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2681f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2682f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2683f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2684f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2685f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2686f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2687f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2688f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 26894e2b4712SSatish Balay } 26904e2b4712SSatish Balay 26914e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 26924e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 26933649974fSBarry Smith 
ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 26941ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2695dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 26964e2b4712SSatish Balay PetscFunctionReturn(0); 26974e2b4712SSatish Balay } 26984e2b4712SSatish Balay 26998f690400SShri Abhyankar #undef __FUNCT__ 27004dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7" 27014dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 270235aa4fcfSShri Abhyankar { 270335aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 270435aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 270535aa4fcfSShri Abhyankar PetscErrorCode ierr; 2706b3260449SShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2707b3260449SShri Abhyankar const PetscInt n=a->mbs,*rout,*cout,*vi; 2708b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 2709b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 2710b3260449SShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2711b3260449SShri Abhyankar const PetscScalar *b; 271235aa4fcfSShri Abhyankar 271335aa4fcfSShri Abhyankar PetscFunctionBegin; 27143649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 271535aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 271635aa4fcfSShri Abhyankar t = a->solve_work; 271735aa4fcfSShri Abhyankar 271835aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 271935aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 272035aa4fcfSShri Abhyankar 272135aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 272235aa4fcfSShri Abhyankar idx = 7*r[0]; 272335aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 272435aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 272535aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 272635aa4fcfSShri Abhyankar 272735aa4fcfSShri Abhyankar for (i=1; i<n; i++) 
{ 272835aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 272935aa4fcfSShri Abhyankar vi = aj + ai[i]; 273035aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 273135aa4fcfSShri Abhyankar idx = 7*r[i]; 273235aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 273335aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 273435aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 273535aa4fcfSShri Abhyankar idx = 7*vi[m]; 273635aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 273735aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 273835aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 273935aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 274035aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 274135aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 274235aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 274335aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 274435aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 274535aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 274635aa4fcfSShri Abhyankar v += 49; 274735aa4fcfSShri Abhyankar } 274835aa4fcfSShri Abhyankar idx = 7*i; 274935aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 275035aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 275135aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 275235aa4fcfSShri Abhyankar } 275335aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 275435aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 275535aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 275635aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 275735aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 
275835aa4fcfSShri Abhyankar idt = 7*i; 275935aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 276035aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 276135aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 276235aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 276335aa4fcfSShri Abhyankar idx = 7*vi[m]; 276435aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 276535aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 276635aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 276735aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 276835aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 276935aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 277035aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 277135aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 277235aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 277335aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 277435aa4fcfSShri Abhyankar v += 49; 277535aa4fcfSShri Abhyankar } 277635aa4fcfSShri Abhyankar idc = 7*c[i]; 277735aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 277835aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 277935aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 278035aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 278135aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 278235aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 278335aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 278435aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 278535aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = 
v[4]*s1+v[11]*s2+v[18]*s3+ 278635aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 278735aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 278835aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 278935aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 279035aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 279135aa4fcfSShri Abhyankar } 279235aa4fcfSShri Abhyankar 279335aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 279435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 27953649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 279635aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 279735aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 279835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 279935aa4fcfSShri Abhyankar } 280035aa4fcfSShri Abhyankar 280135aa4fcfSShri Abhyankar #undef __FUNCT__ 280206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 280306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 280415091d37SBarry Smith { 280515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2806b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2807dfbe8321SBarry Smith PetscErrorCode ierr; 2808b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 2809d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2810d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2811d9fead3dSBarry Smith const PetscScalar *b; 281215091d37SBarry Smith 281315091d37SBarry Smith PetscFunctionBegin; 28143649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 28151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 281615091d37SBarry Smith /* forward solve the lower triangular */ 281715091d37SBarry Smith idx = 0; 281815091d37SBarry Smith x[0] = b[idx]; x[1] 
= b[1+idx]; x[2] = b[2+idx]; 281915091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 282015091d37SBarry Smith x[6] = b[6+idx]; 282115091d37SBarry Smith for (i=1; i<n; i++) { 282215091d37SBarry Smith v = aa + 49*ai[i]; 282315091d37SBarry Smith vi = aj + ai[i]; 282415091d37SBarry Smith nz = diag[i] - ai[i]; 282515091d37SBarry Smith idx = 7*i; 2826f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2827f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2828f1af5d2fSBarry Smith s7 = b[6+idx]; 282915091d37SBarry Smith while (nz--) { 283015091d37SBarry Smith jdx = 7*(*vi++); 283115091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 283215091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 283315091d37SBarry Smith x7 = x[6+jdx]; 2834f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2835f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2836f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2837f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2838f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2839f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2840f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 284115091d37SBarry Smith v += 49; 284215091d37SBarry Smith } 2843f1af5d2fSBarry Smith x[idx] = s1; 2844f1af5d2fSBarry Smith x[1+idx] = s2; 2845f1af5d2fSBarry Smith x[2+idx] = s3; 2846f1af5d2fSBarry Smith x[3+idx] = s4; 2847f1af5d2fSBarry Smith x[4+idx] = s5; 2848f1af5d2fSBarry Smith x[5+idx] = s6; 2849f1af5d2fSBarry Smith x[6+idx] = s7; 285015091d37SBarry Smith } 285115091d37SBarry Smith /* backward solve the upper triangular */ 285215091d37SBarry Smith 
for (i=n-1; i>=0; i--){ 285315091d37SBarry Smith v = aa + 49*diag[i] + 49; 285415091d37SBarry Smith vi = aj + diag[i] + 1; 285515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 285615091d37SBarry Smith idt = 7*i; 2857f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2858f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2859f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 2860f1af5d2fSBarry Smith s7 = x[6+idt]; 286115091d37SBarry Smith while (nz--) { 286215091d37SBarry Smith idx = 7*(*vi++); 286315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 286415091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 286515091d37SBarry Smith x7 = x[6+idx]; 2866f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2867f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2868f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2869f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2870f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2871f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2872f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 287315091d37SBarry Smith v += 49; 287415091d37SBarry Smith } 287515091d37SBarry Smith v = aa + 49*diag[i]; 2876f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2877f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 2878f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2879f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 2880f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2881f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 2882f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + 
v[24]*s4 2883f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 2884f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2885f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 2886f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2887f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 2888f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2889f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 289015091d37SBarry Smith } 289115091d37SBarry Smith 28923649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 28931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2894dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 289515091d37SBarry Smith PetscFunctionReturn(0); 289615091d37SBarry Smith } 289715091d37SBarry Smith 2898cee9d6f2SShri Abhyankar #undef __FUNCT__ 28994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 29004dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 290153cca76cSShri Abhyankar { 290253cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 290453cca76cSShri Abhyankar PetscErrorCode ierr; 2905b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 2906b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 290753cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 290853cca76cSShri Abhyankar PetscScalar *x; 290953cca76cSShri Abhyankar const PetscScalar *b; 291053cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 291153cca76cSShri Abhyankar 291253cca76cSShri Abhyankar PetscFunctionBegin; 29133649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 291453cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 291553cca76cSShri Abhyankar /* forward solve the lower triangular */ 291653cca76cSShri 
Abhyankar idx = 0; 291753cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 291853cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 291953cca76cSShri Abhyankar for (i=1; i<n; i++) { 292053cca76cSShri Abhyankar v = aa + bs2*ai[i]; 292153cca76cSShri Abhyankar vi = aj + ai[i]; 292253cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 292353cca76cSShri Abhyankar idx = bs*i; 292453cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 292553cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 292653cca76cSShri Abhyankar for(k=0;k<nz;k++) { 292753cca76cSShri Abhyankar jdx = bs*vi[k]; 292853cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 292953cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 293053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 293153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 293253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 293353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 293453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 293553cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 293653cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 293753cca76cSShri Abhyankar v += bs2; 293853cca76cSShri Abhyankar } 293953cca76cSShri Abhyankar 294053cca76cSShri Abhyankar x[idx] = s1; 294153cca76cSShri Abhyankar x[1+idx] = s2; 294253cca76cSShri Abhyankar x[2+idx] = s3; 294353cca76cSShri Abhyankar x[3+idx] = s4; 294453cca76cSShri Abhyankar x[4+idx] = s5; 294553cca76cSShri Abhyankar x[5+idx] = s6; 294653cca76cSShri Abhyankar x[6+idx] = s7; 294753cca76cSShri 
Abhyankar } 294853cca76cSShri Abhyankar 294953cca76cSShri Abhyankar /* backward solve the upper triangular */ 295053cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 295153cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 295253cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 295353cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 295453cca76cSShri Abhyankar idt = bs*i; 295553cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 295653cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 295753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 295853cca76cSShri Abhyankar idx = bs*vi[k]; 295953cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 296053cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 296153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 296253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 296353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 296453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 296553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 296653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 296753cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 296853cca76cSShri Abhyankar v += bs2; 296953cca76cSShri Abhyankar } 297053cca76cSShri Abhyankar /* x = inv_diagonal*x */ 297153cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 297253cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 297353cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 
297453cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 297553cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 297653cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 297753cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 297853cca76cSShri Abhyankar } 297953cca76cSShri Abhyankar 29803649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 298153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 298253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 298353cca76cSShri Abhyankar PetscFunctionReturn(0); 298453cca76cSShri Abhyankar } 298553cca76cSShri Abhyankar 298653cca76cSShri Abhyankar #undef __FUNCT__ 298706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 298806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 298915091d37SBarry Smith { 299015091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 299115091d37SBarry Smith IS iscol=a->col,isrow=a->row; 29926849ba73SBarry Smith PetscErrorCode ierr; 29935d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 2994b3260449SShri Abhyankar const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2995b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 2996d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2997d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2998d9fead3dSBarry Smith const PetscScalar *b; 2999b3260449SShri Abhyankar 300015091d37SBarry Smith PetscFunctionBegin; 30013649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 30021ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3003f1af5d2fSBarry Smith t = a->solve_work; 300415091d37SBarry Smith 300515091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); 
r = rout; 300615091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 300715091d37SBarry Smith 300815091d37SBarry Smith /* forward solve the lower triangular */ 300915091d37SBarry Smith idx = 6*(*r++); 3010f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3011f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 3012f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 301315091d37SBarry Smith for (i=1; i<n; i++) { 301415091d37SBarry Smith v = aa + 36*ai[i]; 301515091d37SBarry Smith vi = aj + ai[i]; 301615091d37SBarry Smith nz = diag[i] - ai[i]; 301715091d37SBarry Smith idx = 6*(*r++); 3018f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3019f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 302015091d37SBarry Smith while (nz--) { 302115091d37SBarry Smith idx = 6*(*vi++); 3022f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3023f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3024f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3025f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3026f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3027f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3028f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3029f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 303015091d37SBarry Smith v += 36; 303115091d37SBarry Smith } 303215091d37SBarry Smith idx = 6*i; 3033f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3034f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 3035f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 303615091d37SBarry Smith } 303715091d37SBarry Smith /* backward solve the upper triangular */ 303815091d37SBarry Smith for (i=n-1; i>=0; i--){ 303915091d37SBarry Smith v = aa + 36*diag[i] + 
36; 304015091d37SBarry Smith vi = aj + diag[i] + 1; 304115091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 304215091d37SBarry Smith idt = 6*i; 3043f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3044f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 3045f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 304615091d37SBarry Smith while (nz--) { 304715091d37SBarry Smith idx = 6*(*vi++); 3048f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3049f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3050f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 3051f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3052f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3053f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3054f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3055f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3056f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 305715091d37SBarry Smith v += 36; 305815091d37SBarry Smith } 305915091d37SBarry Smith idc = 6*(*c--); 306015091d37SBarry Smith v = aa + 36*diag[i]; 3061f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3062f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 3063f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3064f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 3065f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3066f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 3067f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3068f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 3069f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3070f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 3071f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3072f1af5d2fSBarry Smith 
v[23]*s4+v[29]*s5+v[35]*s6; 307315091d37SBarry Smith } 307415091d37SBarry Smith 307515091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 307615091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 30773649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 30781ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3079dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 308015091d37SBarry Smith PetscFunctionReturn(0); 308115091d37SBarry Smith } 308215091d37SBarry Smith 30836506fda5SShri Abhyankar #undef __FUNCT__ 30844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6" 30854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 30866506fda5SShri Abhyankar { 30876506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 30886506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 30896506fda5SShri Abhyankar PetscErrorCode ierr; 30906506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3091b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3092b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 30936506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 30946506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 30956506fda5SShri Abhyankar const PetscScalar *b; 3096b3260449SShri Abhyankar 30976506fda5SShri Abhyankar PetscFunctionBegin; 30983649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 30996506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 31006506fda5SShri Abhyankar t = a->solve_work; 31016506fda5SShri Abhyankar 31026506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 31036506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 31046506fda5SShri Abhyankar 31056506fda5SShri Abhyankar /* forward solve the lower triangular */ 31066506fda5SShri Abhyankar idx = 6*r[0]; 31076506fda5SShri Abhyankar 
t[0] = b[idx]; t[1] = b[1+idx]; 31086506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 31096506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 31106506fda5SShri Abhyankar for (i=1; i<n; i++) { 31116506fda5SShri Abhyankar v = aa + 36*ai[i]; 31126506fda5SShri Abhyankar vi = aj + ai[i]; 31136506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 31146506fda5SShri Abhyankar idx = 6*r[i]; 31156506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 31166506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 31176506fda5SShri Abhyankar for(m=0;m<nz;m++){ 31186506fda5SShri Abhyankar idx = 6*vi[m]; 31196506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 31206506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 31216506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 31226506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 31236506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 31246506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 31256506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 31266506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 31276506fda5SShri Abhyankar v += 36; 31286506fda5SShri Abhyankar } 31296506fda5SShri Abhyankar idx = 6*i; 31306506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 31316506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 31326506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 31336506fda5SShri Abhyankar } 31346506fda5SShri Abhyankar /* backward solve the upper triangular */ 31356506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 31366506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 31376506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 31386506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 31396506fda5SShri Abhyankar idt = 6*i; 
31406506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 31416506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 31426506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 31436506fda5SShri Abhyankar for(m=0;m<nz;m++){ 31446506fda5SShri Abhyankar idx = 6*vi[m]; 31456506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 31466506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 31476506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 31486506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 31496506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 31506506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 31516506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 31526506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 31536506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 31546506fda5SShri Abhyankar v += 36; 31556506fda5SShri Abhyankar } 31566506fda5SShri Abhyankar idc = 6*c[i]; 31576506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 31586506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 31596506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 31606506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 31616506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 31626506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 31636506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 31646506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 31656506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 31666506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 31676506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 31686506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 31696506fda5SShri Abhyankar } 31706506fda5SShri Abhyankar 31716506fda5SShri 
Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 31726506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 31733649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 31746506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 31756506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 31766506fda5SShri Abhyankar PetscFunctionReturn(0); 31776506fda5SShri Abhyankar } 31788f690400SShri Abhyankar 31798f690400SShri Abhyankar #undef __FUNCT__ 318006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 318106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 318215091d37SBarry Smith { 318315091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3184b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3185dfbe8321SBarry Smith PetscErrorCode ierr; 3186b3260449SShri Abhyankar const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3187d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3188d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3189d9fead3dSBarry Smith const PetscScalar *b; 319015091d37SBarry Smith 319115091d37SBarry Smith PetscFunctionBegin; 31923649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 31931ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 319415091d37SBarry Smith /* forward solve the lower triangular */ 319515091d37SBarry Smith idx = 0; 319615091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 319715091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 319815091d37SBarry Smith for (i=1; i<n; i++) { 319915091d37SBarry Smith v = aa + 36*ai[i]; 320015091d37SBarry Smith vi = aj + ai[i]; 320115091d37SBarry Smith nz = diag[i] - ai[i]; 320215091d37SBarry Smith idx = 6*i; 3203f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3204f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = 
b[5+idx]; 320515091d37SBarry Smith while (nz--) { 320615091d37SBarry Smith jdx = 6*(*vi++); 320715091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 320815091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3209f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3210f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3211f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3212f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3213f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3214f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 321515091d37SBarry Smith v += 36; 321615091d37SBarry Smith } 3217f1af5d2fSBarry Smith x[idx] = s1; 3218f1af5d2fSBarry Smith x[1+idx] = s2; 3219f1af5d2fSBarry Smith x[2+idx] = s3; 3220f1af5d2fSBarry Smith x[3+idx] = s4; 3221f1af5d2fSBarry Smith x[4+idx] = s5; 3222f1af5d2fSBarry Smith x[5+idx] = s6; 322315091d37SBarry Smith } 322415091d37SBarry Smith /* backward solve the upper triangular */ 322515091d37SBarry Smith for (i=n-1; i>=0; i--){ 322615091d37SBarry Smith v = aa + 36*diag[i] + 36; 322715091d37SBarry Smith vi = aj + diag[i] + 1; 322815091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 322915091d37SBarry Smith idt = 6*i; 3230f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3231f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 3232f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 323315091d37SBarry Smith while (nz--) { 323415091d37SBarry Smith idx = 6*(*vi++); 323515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 323615091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3237f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3238f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 
3239f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3240f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3241f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3242f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 324315091d37SBarry Smith v += 36; 324415091d37SBarry Smith } 324515091d37SBarry Smith v = aa + 36*diag[i]; 3246f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3247f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3248f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3249f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3250f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3251f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 325215091d37SBarry Smith } 325315091d37SBarry Smith 32543649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 32551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3256dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 325715091d37SBarry Smith PetscFunctionReturn(0); 325815091d37SBarry Smith } 325915091d37SBarry Smith 3260cee9d6f2SShri Abhyankar #undef __FUNCT__ 32614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 32624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 326353cca76cSShri Abhyankar { 326453cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3265b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 326653cca76cSShri Abhyankar PetscErrorCode ierr; 3267b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 
3268b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 326953cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 327053cca76cSShri Abhyankar PetscScalar *x; 327153cca76cSShri Abhyankar const PetscScalar *b; 327253cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 327353cca76cSShri Abhyankar 327453cca76cSShri Abhyankar PetscFunctionBegin; 32753649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 327653cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 327753cca76cSShri Abhyankar /* forward solve the lower triangular */ 327853cca76cSShri Abhyankar idx = 0; 327953cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 328053cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 328153cca76cSShri Abhyankar for (i=1; i<n; i++) { 328253cca76cSShri Abhyankar v = aa + bs2*ai[i]; 328353cca76cSShri Abhyankar vi = aj + ai[i]; 328453cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 328553cca76cSShri Abhyankar idx = bs*i; 328653cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 328753cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 328853cca76cSShri Abhyankar for(k=0;k<nz;k++){ 328953cca76cSShri Abhyankar jdx = bs*vi[k]; 329053cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 329153cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 329253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 329353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 329453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 329553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 329653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 329753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 329853cca76cSShri Abhyankar v += 
bs2; 329953cca76cSShri Abhyankar } 330053cca76cSShri Abhyankar 330153cca76cSShri Abhyankar x[idx] = s1; 330253cca76cSShri Abhyankar x[1+idx] = s2; 330353cca76cSShri Abhyankar x[2+idx] = s3; 330453cca76cSShri Abhyankar x[3+idx] = s4; 330553cca76cSShri Abhyankar x[4+idx] = s5; 330653cca76cSShri Abhyankar x[5+idx] = s6; 330753cca76cSShri Abhyankar } 330853cca76cSShri Abhyankar 330953cca76cSShri Abhyankar /* backward solve the upper triangular */ 331053cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 331153cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 331253cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 331353cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 331453cca76cSShri Abhyankar idt = bs*i; 331553cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 331653cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 331753cca76cSShri Abhyankar for(k=0;k<nz;k++){ 331853cca76cSShri Abhyankar idx = bs*vi[k]; 331953cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 332053cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 332153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 332253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 332353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 332453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 332553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 332653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 332753cca76cSShri Abhyankar v += bs2; 332853cca76cSShri Abhyankar } 332953cca76cSShri Abhyankar /* x = inv_diagonal*x */ 333053cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 333153cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 
333253cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 333353cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 333453cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 333553cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 333653cca76cSShri Abhyankar } 333753cca76cSShri Abhyankar 33383649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 333953cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 334053cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 334153cca76cSShri Abhyankar PetscFunctionReturn(0); 334253cca76cSShri Abhyankar } 334353cca76cSShri Abhyankar 334453cca76cSShri Abhyankar #undef __FUNCT__ 334506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 334606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 33474e2b4712SSatish Balay { 33484e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 33494e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 33506849ba73SBarry Smith PetscErrorCode ierr; 33515d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3352b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3353b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 3354d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3355d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3356d9fead3dSBarry Smith const PetscScalar *b; 33574e2b4712SSatish Balay 33584e2b4712SSatish Balay PetscFunctionBegin; 33593649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 33601ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3361f1af5d2fSBarry Smith t = a->solve_work; 33624e2b4712SSatish Balay 33634e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 33644e2b4712SSatish Balay ierr = 
ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 33654e2b4712SSatish Balay 33664e2b4712SSatish Balay /* forward solve the lower triangular */ 33674e2b4712SSatish Balay idx = 5*(*r++); 3368f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3369f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 33704e2b4712SSatish Balay for (i=1; i<n; i++) { 33714e2b4712SSatish Balay v = aa + 25*ai[i]; 33724e2b4712SSatish Balay vi = aj + ai[i]; 33734e2b4712SSatish Balay nz = diag[i] - ai[i]; 33744e2b4712SSatish Balay idx = 5*(*r++); 3375f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3376f1af5d2fSBarry Smith s5 = b[4+idx]; 33774e2b4712SSatish Balay while (nz--) { 33784e2b4712SSatish Balay idx = 5*(*vi++); 3379f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3380f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 3381f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3382f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3383f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3384f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3385f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 33864e2b4712SSatish Balay v += 25; 33874e2b4712SSatish Balay } 33884e2b4712SSatish Balay idx = 5*i; 3389f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3390f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 33914e2b4712SSatish Balay } 33924e2b4712SSatish Balay /* backward solve the upper triangular */ 33934e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 33944e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 33954e2b4712SSatish Balay vi = aj + diag[i] + 1; 33964e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 33974e2b4712SSatish Balay idt = 5*i; 3398f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3399f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 34004e2b4712SSatish Balay 
while (nz--) { 34014e2b4712SSatish Balay idx = 5*(*vi++); 3402f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3403f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3404f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3405f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3406f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3407f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3408f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 34094e2b4712SSatish Balay v += 25; 34104e2b4712SSatish Balay } 34114e2b4712SSatish Balay idc = 5*(*c--); 34124e2b4712SSatish Balay v = aa + 25*diag[i]; 3413f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3414f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 3415f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3416f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 3417f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3418f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 3419f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3420f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 3421f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3422f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 34234e2b4712SSatish Balay } 34244e2b4712SSatish Balay 34254e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 34264e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 34273649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 34281ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3429dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 34304e2b4712SSatish Balay PetscFunctionReturn(0); 34314e2b4712SSatish Balay } 34324e2b4712SSatish Balay 343378bb4007SShri Abhyankar #undef __FUNCT__ 34344dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5" 
34354dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 343678bb4007SShri Abhyankar { 343778bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 343878bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 343978bb4007SShri Abhyankar PetscErrorCode ierr; 344078bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 3441b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3442b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 344378bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 344478bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 344578bb4007SShri Abhyankar const PetscScalar *b; 344678bb4007SShri Abhyankar 344778bb4007SShri Abhyankar PetscFunctionBegin; 34483649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 344978bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 345078bb4007SShri Abhyankar t = a->solve_work; 345178bb4007SShri Abhyankar 345278bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 345378bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 345478bb4007SShri Abhyankar 345578bb4007SShri Abhyankar /* forward solve the lower triangular */ 345678bb4007SShri Abhyankar idx = 5*r[0]; 345778bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 345878bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 345978bb4007SShri Abhyankar for (i=1; i<n; i++) { 346078bb4007SShri Abhyankar v = aa + 25*ai[i]; 346178bb4007SShri Abhyankar vi = aj + ai[i]; 346278bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 346378bb4007SShri Abhyankar idx = 5*r[i]; 346478bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 346578bb4007SShri Abhyankar s5 = b[4+idx]; 346678bb4007SShri Abhyankar for(m=0;m<nz;m++){ 346778bb4007SShri Abhyankar idx = 5*vi[m]; 346878bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 346978bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 
347078bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 347178bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 347278bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 347378bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 347478bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 347578bb4007SShri Abhyankar v += 25; 347678bb4007SShri Abhyankar } 347778bb4007SShri Abhyankar idx = 5*i; 347878bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 347978bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 348078bb4007SShri Abhyankar } 348178bb4007SShri Abhyankar /* backward solve the upper triangular */ 348278bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 348378bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 348478bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 348578bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 348678bb4007SShri Abhyankar idt = 5*i; 348778bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 348878bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 348978bb4007SShri Abhyankar for(m=0;m<nz;m++){ 349078bb4007SShri Abhyankar idx = 5*vi[m]; 349178bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 349278bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 349378bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 349478bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 349578bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 349678bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 349778bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 349878bb4007SShri Abhyankar v += 25; 349978bb4007SShri Abhyankar } 350078bb4007SShri Abhyankar idc = 5*c[i]; 350178bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 350278bb4007SShri 
Abhyankar v[15]*s4+v[20]*s5; 350378bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 350478bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 350578bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 350678bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 350778bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 350878bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 350978bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 351078bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 351178bb4007SShri Abhyankar } 351278bb4007SShri Abhyankar 351378bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 351478bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 35153649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 351678bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 351778bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 351878bb4007SShri Abhyankar PetscFunctionReturn(0); 351978bb4007SShri Abhyankar } 352078bb4007SShri Abhyankar 35218f690400SShri Abhyankar #undef __FUNCT__ 352206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 352306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 352415091d37SBarry Smith { 352515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3526b3260449SShri Abhyankar const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3527b3260449SShri Abhyankar PetscInt i,nz,idx,idt,jdx; 3528dfbe8321SBarry Smith PetscErrorCode ierr; 3529d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3530d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3531d9fead3dSBarry Smith const PetscScalar *b; 353215091d37SBarry Smith 353315091d37SBarry Smith PetscFunctionBegin; 35343649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 35351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 
353615091d37SBarry Smith /* forward solve the lower triangular */ 353715091d37SBarry Smith idx = 0; 353815091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 353915091d37SBarry Smith for (i=1; i<n; i++) { 354015091d37SBarry Smith v = aa + 25*ai[i]; 354115091d37SBarry Smith vi = aj + ai[i]; 354215091d37SBarry Smith nz = diag[i] - ai[i]; 354315091d37SBarry Smith idx = 5*i; 3544f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 354515091d37SBarry Smith while (nz--) { 354615091d37SBarry Smith jdx = 5*(*vi++); 354715091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3548f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3549f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3550f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3551f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3552f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 355315091d37SBarry Smith v += 25; 355415091d37SBarry Smith } 3555f1af5d2fSBarry Smith x[idx] = s1; 3556f1af5d2fSBarry Smith x[1+idx] = s2; 3557f1af5d2fSBarry Smith x[2+idx] = s3; 3558f1af5d2fSBarry Smith x[3+idx] = s4; 3559f1af5d2fSBarry Smith x[4+idx] = s5; 356015091d37SBarry Smith } 356115091d37SBarry Smith /* backward solve the upper triangular */ 356215091d37SBarry Smith for (i=n-1; i>=0; i--){ 356315091d37SBarry Smith v = aa + 25*diag[i] + 25; 356415091d37SBarry Smith vi = aj + diag[i] + 1; 356515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 356615091d37SBarry Smith idt = 5*i; 3567f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3568f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 356915091d37SBarry Smith while (nz--) { 357015091d37SBarry Smith idx = 5*(*vi++); 357115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 
3572f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3573f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3574f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3575f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3576f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 357715091d37SBarry Smith v += 25; 357815091d37SBarry Smith } 357915091d37SBarry Smith v = aa + 25*diag[i]; 3580f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3581f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3582f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3583f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3584f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 358515091d37SBarry Smith } 358615091d37SBarry Smith 35873649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 35881ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3589dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 359015091d37SBarry Smith PetscFunctionReturn(0); 359115091d37SBarry Smith } 359215091d37SBarry Smith 3593cee9d6f2SShri Abhyankar #undef __FUNCT__ 35944dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 35954dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 359653cca76cSShri Abhyankar { 359753cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3598b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3599b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 360053cca76cSShri Abhyankar PetscErrorCode ierr; 360153cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 360253cca76cSShri Abhyankar PetscScalar 
*x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 360353cca76cSShri Abhyankar const PetscScalar *b; 360453cca76cSShri Abhyankar 360553cca76cSShri Abhyankar PetscFunctionBegin; 36063649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 360753cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 360853cca76cSShri Abhyankar /* forward solve the lower triangular */ 360953cca76cSShri Abhyankar idx = 0; 361053cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 361153cca76cSShri Abhyankar for (i=1; i<n; i++) { 361253cca76cSShri Abhyankar v = aa + 25*ai[i]; 361353cca76cSShri Abhyankar vi = aj + ai[i]; 361453cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 361553cca76cSShri Abhyankar idx = 5*i; 361653cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 361753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 361853cca76cSShri Abhyankar jdx = 5*vi[k]; 361953cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 362053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 362153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 362253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 362353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 362453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 362553cca76cSShri Abhyankar v += 25; 362653cca76cSShri Abhyankar } 362753cca76cSShri Abhyankar x[idx] = s1; 362853cca76cSShri Abhyankar x[1+idx] = s2; 362953cca76cSShri Abhyankar x[2+idx] = s3; 363053cca76cSShri Abhyankar x[3+idx] = s4; 363153cca76cSShri Abhyankar x[4+idx] = s5; 363253cca76cSShri Abhyankar } 363353cca76cSShri Abhyankar 363453cca76cSShri Abhyankar /* backward solve the upper triangular */ 363553cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 363653cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 363753cca76cSShri 
Abhyankar vi = aj + adiag[i+1]+1; 363853cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 363953cca76cSShri Abhyankar idt = 5*i; 364053cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 364153cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 364253cca76cSShri Abhyankar for(k=0;k<nz;k++){ 364353cca76cSShri Abhyankar idx = 5*vi[k]; 364453cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 364553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 364653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 364753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 364853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 364953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 365053cca76cSShri Abhyankar v += 25; 365153cca76cSShri Abhyankar } 365253cca76cSShri Abhyankar /* x = inv_diagonal*x */ 365353cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 365453cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 365553cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 365653cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 365753cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 365853cca76cSShri Abhyankar } 365953cca76cSShri Abhyankar 36603649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 366153cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 366253cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 366353cca76cSShri Abhyankar PetscFunctionReturn(0); 366453cca76cSShri Abhyankar } 366553cca76cSShri Abhyankar 366653cca76cSShri Abhyankar #undef __FUNCT__ 366706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 366806e38f1dSHong 
Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 36694e2b4712SSatish Balay { 36704e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 36714e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 36726849ba73SBarry Smith PetscErrorCode ierr; 3673b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3674b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 36755d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3676d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3677d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3678d9fead3dSBarry Smith const PetscScalar *b; 36794e2b4712SSatish Balay 36804e2b4712SSatish Balay PetscFunctionBegin; 36813649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 36821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3683f1af5d2fSBarry Smith t = a->solve_work; 36844e2b4712SSatish Balay 36854e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 36864e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 36874e2b4712SSatish Balay 36884e2b4712SSatish Balay /* forward solve the lower triangular */ 36894e2b4712SSatish Balay idx = 4*(*r++); 3690f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 3691f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 36924e2b4712SSatish Balay for (i=1; i<n; i++) { 36934e2b4712SSatish Balay v = aa + 16*ai[i]; 36944e2b4712SSatish Balay vi = aj + ai[i]; 36954e2b4712SSatish Balay nz = diag[i] - ai[i]; 36964e2b4712SSatish Balay idx = 4*(*r++); 3697f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 36984e2b4712SSatish Balay while (nz--) { 36994e2b4712SSatish Balay idx = 4*(*vi++); 3700f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3701f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3702f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3703f1af5d2fSBarry Smith s3 -= v[2]*x1 + 
v[6]*x2 + v[10]*x3 + v[14]*x4; 3704f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 37054e2b4712SSatish Balay v += 16; 37064e2b4712SSatish Balay } 37074e2b4712SSatish Balay idx = 4*i; 3708f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 3709f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 37104e2b4712SSatish Balay } 37114e2b4712SSatish Balay /* backward solve the upper triangular */ 37124e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 37134e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 37144e2b4712SSatish Balay vi = aj + diag[i] + 1; 37154e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 37164e2b4712SSatish Balay idt = 4*i; 3717f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 3718f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 37194e2b4712SSatish Balay while (nz--) { 37204e2b4712SSatish Balay idx = 4*(*vi++); 3721f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 3722f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 3723f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3724f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3725f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3726f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 37274e2b4712SSatish Balay v += 16; 37284e2b4712SSatish Balay } 37294e2b4712SSatish Balay idc = 4*(*c--); 37304e2b4712SSatish Balay v = aa + 16*diag[i]; 3731f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3732f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3733f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3734f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 37354e2b4712SSatish Balay } 37364e2b4712SSatish Balay 37374e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 37384e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 37393649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 
37401ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3741dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37424e2b4712SSatish Balay PetscFunctionReturn(0); 37434e2b4712SSatish Balay } 3744f26ec98cSKris Buschelman 37458f690400SShri Abhyankar #undef __FUNCT__ 37464dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4" 37474dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 374878bb4007SShri Abhyankar { 374978bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 375078bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 375178bb4007SShri Abhyankar PetscErrorCode ierr; 3752b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3753b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 375478bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 375578bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 375678bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 375778bb4007SShri Abhyankar const PetscScalar *b; 375878bb4007SShri Abhyankar 375978bb4007SShri Abhyankar PetscFunctionBegin; 37603649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 376178bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 376278bb4007SShri Abhyankar t = a->solve_work; 376378bb4007SShri Abhyankar 376478bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 376578bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 376678bb4007SShri Abhyankar 376778bb4007SShri Abhyankar /* forward solve the lower triangular */ 376878bb4007SShri Abhyankar idx = 4*r[0]; 376978bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 377078bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 377178bb4007SShri Abhyankar for (i=1; i<n; i++) { 377278bb4007SShri Abhyankar v = aa + 16*ai[i]; 377378bb4007SShri Abhyankar vi = aj + ai[i]; 377478bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 377578bb4007SShri 
Abhyankar idx = 4*r[i]; 377678bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 377778bb4007SShri Abhyankar for(m=0;m<nz;m++){ 377878bb4007SShri Abhyankar idx = 4*vi[m]; 377978bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 378078bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 378178bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 378278bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 378378bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 378478bb4007SShri Abhyankar v += 16; 378578bb4007SShri Abhyankar } 378678bb4007SShri Abhyankar idx = 4*i; 378778bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 378878bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 378978bb4007SShri Abhyankar } 379078bb4007SShri Abhyankar /* backward solve the upper triangular */ 379178bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 379278bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 379378bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 379478bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 379578bb4007SShri Abhyankar idt = 4*i; 379678bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 379778bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 379878bb4007SShri Abhyankar for(m=0;m<nz;m++){ 379978bb4007SShri Abhyankar idx = 4*vi[m]; 380078bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 380178bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 380278bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 380378bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 380478bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 380578bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 380678bb4007SShri Abhyankar v += 16; 380778bb4007SShri Abhyankar } 380878bb4007SShri Abhyankar idc = 4*c[i]; 380978bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 381078bb4007SShri Abhyankar x[1+idc] = 
t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 381178bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 381278bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 381378bb4007SShri Abhyankar } 381478bb4007SShri Abhyankar 381578bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 381678bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 38173649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 381878bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 381978bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 382078bb4007SShri Abhyankar PetscFunctionReturn(0); 382178bb4007SShri Abhyankar } 382278bb4007SShri Abhyankar 382378bb4007SShri Abhyankar #undef __FUNCT__ 3824f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3825dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3826f26ec98cSKris Buschelman { 3827f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3828f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 38296849ba73SBarry Smith PetscErrorCode ierr; 3830b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3831b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 38325d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3833d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3834d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3835d9fead3dSBarry Smith PetscScalar *x; 3836d9fead3dSBarry Smith const PetscScalar *b; 3837f26ec98cSKris Buschelman 3838f26ec98cSKris Buschelman PetscFunctionBegin; 38393649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 38401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3841f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 3842f26ec98cSKris Buschelman 3843f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = 
rout; 3844f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3845f26ec98cSKris Buschelman 3846f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3847f26ec98cSKris Buschelman idx = 4*(*r++); 3848f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 3849f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 3850f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 3851f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 3852f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3853f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3854f26ec98cSKris Buschelman vi = aj + ai[i]; 3855f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3856f26ec98cSKris Buschelman idx = 4*(*r++); 3857f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3858f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3859f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3860f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3861f26ec98cSKris Buschelman while (nz--) { 3862f26ec98cSKris Buschelman idx = 4*(*vi++); 3863f26ec98cSKris Buschelman x1 = t[idx]; 3864f26ec98cSKris Buschelman x2 = t[1+idx]; 3865f26ec98cSKris Buschelman x3 = t[2+idx]; 3866f26ec98cSKris Buschelman x4 = t[3+idx]; 3867f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3868f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3869f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3870f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3871f26ec98cSKris Buschelman v += 16; 3872f26ec98cSKris Buschelman } 3873f26ec98cSKris Buschelman idx = 4*i; 3874f26ec98cSKris Buschelman t[idx] = s1; 3875f26ec98cSKris Buschelman t[1+idx] = s2; 3876f26ec98cSKris Buschelman t[2+idx] = s3; 3877f26ec98cSKris Buschelman t[3+idx] = s4; 3878f26ec98cSKris Buschelman } 3879f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3880f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3881f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 
3882f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3883f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3884f26ec98cSKris Buschelman idt = 4*i; 3885f26ec98cSKris Buschelman s1 = t[idt]; 3886f26ec98cSKris Buschelman s2 = t[1+idt]; 3887f26ec98cSKris Buschelman s3 = t[2+idt]; 3888f26ec98cSKris Buschelman s4 = t[3+idt]; 3889f26ec98cSKris Buschelman while (nz--) { 3890f26ec98cSKris Buschelman idx = 4*(*vi++); 3891f26ec98cSKris Buschelman x1 = t[idx]; 3892f26ec98cSKris Buschelman x2 = t[1+idx]; 3893f26ec98cSKris Buschelman x3 = t[2+idx]; 3894f26ec98cSKris Buschelman x4 = t[3+idx]; 3895f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3896f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3897f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3898f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3899f26ec98cSKris Buschelman v += 16; 3900f26ec98cSKris Buschelman } 3901f26ec98cSKris Buschelman idc = 4*(*c--); 3902f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3903f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3904f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3905f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3906f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3907f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3908f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3909f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3910f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3911f26ec98cSKris Buschelman } 3912f26ec98cSKris Buschelman 3913f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3914f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 39153649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 39161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3917dc0b31edSSatish Balay ierr = 
PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3918f26ec98cSKris Buschelman PetscFunctionReturn(0); 3919f26ec98cSKris Buschelman } 3920f26ec98cSKris Buschelman 392124c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 392224c233c2SKris Buschelman 392324c233c2SKris Buschelman #include PETSC_HAVE_SSE 392424c233c2SKris Buschelman 392524c233c2SKris Buschelman #undef __FUNCT__ 392624c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3927dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 392824c233c2SKris Buschelman { 392924c233c2SKris Buschelman /* 393024c233c2SKris Buschelman Note: This code uses demotion of double 393124c233c2SKris Buschelman to float when performing the mixed-mode computation. 393224c233c2SKris Buschelman This may not be numerically reasonable for all applications. 393324c233c2SKris Buschelman */ 393424c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 393524c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 39366849ba73SBarry Smith PetscErrorCode ierr; 39375d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 39385d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 393924c233c2SKris Buschelman MatScalar *aa=a->a,*v; 394087828ca2SBarry Smith PetscScalar *x,*b,*t; 394124c233c2SKris Buschelman 394224c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 394324c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 394424c233c2SKris Buschelman unsigned long offset; 394524c233c2SKris Buschelman 394624c233c2SKris Buschelman PetscFunctionBegin; 394724c233c2SKris Buschelman SSE_SCOPE_BEGIN; 394824c233c2SKris Buschelman 394924c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 395024c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 395124c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 395224c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 
395324c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 395424c233c2SKris Buschelman 39551ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 39561ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 395724c233c2SKris Buschelman t = a->solve_work; 395824c233c2SKris Buschelman 395924c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 396024c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 396124c233c2SKris Buschelman 396224c233c2SKris Buschelman /* forward solve the lower triangular */ 396324c233c2SKris Buschelman idx = 4*(*r++); 396424c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 396524c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 396624c233c2SKris Buschelman v = aa + 16*ai[1]; 396724c233c2SKris Buschelman 396824c233c2SKris Buschelman for (i=1; i<n;) { 396924c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 397024c233c2SKris Buschelman vi = aj + ai[i]; 397124c233c2SKris Buschelman nz = diag[i] - ai[i]; 397224c233c2SKris Buschelman idx = 4*(*r++); 397324c233c2SKris Buschelman 397424c233c2SKris Buschelman /* Demote sum from double to float */ 397524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 397624c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 397724c233c2SKris Buschelman 397824c233c2SKris Buschelman while (nz--) { 397924c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 398024c233c2SKris Buschelman idx = 4*(*vi++); 398124c233c2SKris Buschelman 398224c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 398324c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 398424c233c2SKris Buschelman 398524c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 398624c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 398724c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 398824c233c2SKris Buschelman 398924c233c2SKris Buschelman /* First Column */ 399024c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 
399124c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 399224c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 399324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 399424c233c2SKris Buschelman 399524c233c2SKris Buschelman /* Second Column */ 399624c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 399724c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 399824c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 399924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 400024c233c2SKris Buschelman 400124c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 400224c233c2SKris Buschelman 400324c233c2SKris Buschelman /* Third Column */ 400424c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 400524c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 400624c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 400724c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 400824c233c2SKris Buschelman 400924c233c2SKris Buschelman /* Fourth Column */ 401024c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 401124c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 401224c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 401324c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 401424c233c2SKris Buschelman SSE_INLINE_END_2 401524c233c2SKris Buschelman 401624c233c2SKris Buschelman v += 16; 401724c233c2SKris Buschelman } 401824c233c2SKris Buschelman idx = 4*i; 401924c233c2SKris Buschelman v = aa + 16*ai[++i]; 402024c233c2SKris Buschelman PREFETCH_NTA(v); 402124c233c2SKris Buschelman STORE_PS(tmps,XMM7); 402224c233c2SKris Buschelman 402324c233c2SKris Buschelman /* Promote result from float to double */ 402424c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 402524c233c2SKris Buschelman } 402624c233c2SKris Buschelman /* backward solve the upper triangular */ 402724c233c2SKris Buschelman idt = 4*(n-1); 402824c233c2SKris Buschelman ai16 = 16*diag[n-1]; 402924c233c2SKris Buschelman v = aa + ai16 + 16; 403024c233c2SKris Buschelman for (i=n-1; i>=0;){ 403124c233c2SKris 
Buschelman PREFETCH_NTA(&v[8]); 403224c233c2SKris Buschelman vi = aj + diag[i] + 1; 403324c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 403424c233c2SKris Buschelman 403524c233c2SKris Buschelman /* Demote accumulator from double to float */ 403624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 403724c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 403824c233c2SKris Buschelman 403924c233c2SKris Buschelman while (nz--) { 404024c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 404124c233c2SKris Buschelman idx = 4*(*vi++); 404224c233c2SKris Buschelman 404324c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 404424c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 404524c233c2SKris Buschelman 404624c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 404724c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 404824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 404924c233c2SKris Buschelman 405024c233c2SKris Buschelman /* First Column */ 405124c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 405224c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 405324c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 405424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 405524c233c2SKris Buschelman 405624c233c2SKris Buschelman /* Second Column */ 405724c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 405824c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 405924c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 406024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 406124c233c2SKris Buschelman 406224c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 406324c233c2SKris Buschelman 406424c233c2SKris Buschelman /* Third Column */ 406524c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 406624c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 406724c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 406824c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 406924c233c2SKris Buschelman 
407024c233c2SKris Buschelman /* Fourth Column */ 407124c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 407224c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 407324c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 407424c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 407524c233c2SKris Buschelman SSE_INLINE_END_2 407624c233c2SKris Buschelman v += 16; 407724c233c2SKris Buschelman } 407824c233c2SKris Buschelman v = aa + ai16; 407924c233c2SKris Buschelman ai16 = 16*diag[--i]; 408024c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 408124c233c2SKris Buschelman /* 408224c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 408324c233c2SKris Buschelman which was inverted as part of the factorization 408424c233c2SKris Buschelman */ 408524c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 408624c233c2SKris Buschelman /* First Column */ 408724c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 408824c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 408924c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 409024c233c2SKris Buschelman 409124c233c2SKris Buschelman /* Second Column */ 409224c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 409324c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 409424c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 409524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 409624c233c2SKris Buschelman 409724c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 409824c233c2SKris Buschelman 409924c233c2SKris Buschelman /* Third Column */ 410024c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 410124c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 410224c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 410324c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 410424c233c2SKris Buschelman 410524c233c2SKris Buschelman /* Fourth Column */ 410624c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 410724c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 410824c233c2SKris Buschelman 
SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 410924c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 411024c233c2SKris Buschelman 411124c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 411224c233c2SKris Buschelman SSE_INLINE_END_3 411324c233c2SKris Buschelman 411424c233c2SKris Buschelman /* Promote solution from float to double */ 411524c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 411624c233c2SKris Buschelman 411724c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 411824c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 411924c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 412024c233c2SKris Buschelman idc = 4*(*c--); 412124c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 412224c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 412324c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 412424c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 412524c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 412624c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 412724c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 412824c233c2SKris Buschelman SSE_INLINE_END_2 412924c233c2SKris Buschelman v = aa + ai16 + 16; 413024c233c2SKris Buschelman idt -= 4; 413124c233c2SKris Buschelman } 413224c233c2SKris Buschelman 413324c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 413424c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 41351ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 41361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4137dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 413824c233c2SKris Buschelman SSE_SCOPE_END; 413924c233c2SKris Buschelman PetscFunctionReturn(0); 414024c233c2SKris Buschelman } 414124c233c2SKris Buschelman 414224c233c2SKris Buschelman #endif 
41430ef38995SBarry Smith 41440ef38995SBarry Smith 41454e2b4712SSatish Balay /* 41464e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 41474e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 41484e2b4712SSatish Balay */ 41494a2ae208SSatish Balay #undef __FUNCT__ 415006e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 415106e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 41524e2b4712SSatish Balay { 41534e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4154356650c2SBarry Smith PetscInt n=a->mbs; 4155356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 4156dfbe8321SBarry Smith PetscErrorCode ierr; 4157356650c2SBarry Smith const PetscInt *diag = a->diag; 4158d9fead3dSBarry Smith const MatScalar *aa=a->a; 4159d9fead3dSBarry Smith PetscScalar *x; 4160d9fead3dSBarry Smith const PetscScalar *b; 41614e2b4712SSatish Balay 41624e2b4712SSatish Balay PetscFunctionBegin; 41633649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 41641ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 41654e2b4712SSatish Balay 4166aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 41672853dc0eSBarry Smith { 416887828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41692853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 41702853dc0eSBarry Smith } 4171aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 41722853dc0eSBarry Smith { 417387828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 41742853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 41752853dc0eSBarry Smith } 4176aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 41772853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4178e1293385SBarry Smith #else 417930d4dcafSBarry Smith { 418087828ca2SBarry 
Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4181d9fead3dSBarry Smith const MatScalar *v; 4182356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 4183356650c2SBarry Smith const PetscInt *vi; 4184e1293385SBarry Smith 41854e2b4712SSatish Balay /* forward solve the lower triangular */ 41864e2b4712SSatish Balay idx = 0; 4187e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 41884e2b4712SSatish Balay for (i=1; i<n; i++) { 41894e2b4712SSatish Balay v = aa + 16*ai[i]; 41904e2b4712SSatish Balay vi = aj + ai[i]; 41914e2b4712SSatish Balay nz = diag[i] - ai[i]; 4192e1293385SBarry Smith idx += 4; 4193f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 41944e2b4712SSatish Balay while (nz--) { 41954e2b4712SSatish Balay jdx = 4*(*vi++); 41964e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4197f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4198f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4199f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4200f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 42014e2b4712SSatish Balay v += 16; 42024e2b4712SSatish Balay } 4203f1af5d2fSBarry Smith x[idx] = s1; 4204f1af5d2fSBarry Smith x[1+idx] = s2; 4205f1af5d2fSBarry Smith x[2+idx] = s3; 4206f1af5d2fSBarry Smith x[3+idx] = s4; 42074e2b4712SSatish Balay } 42084e2b4712SSatish Balay /* backward solve the upper triangular */ 42094e555682SBarry Smith idt = 4*(n-1); 42104e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 42114e555682SBarry Smith ai16 = 16*diag[i]; 42124e555682SBarry Smith v = aa + ai16 + 16; 42134e2b4712SSatish Balay vi = aj + diag[i] + 1; 42144e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4215f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4216f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 42174e2b4712SSatish Balay while (nz--) { 42184e2b4712SSatish Balay idx = 4*(*vi++); 42194e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = 
x[2+idx]; x4 = x[3+idx]; 4220f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4221f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4222f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4223f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 42244e2b4712SSatish Balay v += 16; 42254e2b4712SSatish Balay } 42264e555682SBarry Smith v = aa + ai16; 4227f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4228f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4229f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4230f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4231329f5518SBarry Smith idt -= 4; 42324e2b4712SSatish Balay } 423330d4dcafSBarry Smith } 4234e1293385SBarry Smith #endif 42354e2b4712SSatish Balay 42363649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 42371ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4238dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 42394e2b4712SSatish Balay PetscFunctionReturn(0); 42404e2b4712SSatish Balay } 42414e2b4712SSatish Balay 4242b2b2dd24SShri Abhyankar #undef __FUNCT__ 42434dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 42444dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4245b2b2dd24SShri Abhyankar { 4246b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4247b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4248b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 4249b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4250b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4251b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4252b2b2dd24SShri Abhyankar PetscScalar *x; 4253b2b2dd24SShri Abhyankar const PetscScalar *b; 4254b2b2dd24SShri Abhyankar PetscScalar 
s1,s2,s3,s4,x1,x2,x3,x4; 4255cee9d6f2SShri Abhyankar 4256b2b2dd24SShri Abhyankar PetscFunctionBegin; 42573649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 4258b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4259b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4260b2b2dd24SShri Abhyankar idx = 0; 4261b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4262b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4263b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4264b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4265b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4266b2b2dd24SShri Abhyankar idx = bs*i; 4267b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4268b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 4269b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4270b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4271b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4272b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4273b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4274b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4275b2b2dd24SShri Abhyankar 4276b2b2dd24SShri Abhyankar v += bs2; 4277b2b2dd24SShri Abhyankar } 4278b2b2dd24SShri Abhyankar 4279b2b2dd24SShri Abhyankar x[idx] = s1; 4280b2b2dd24SShri Abhyankar x[1+idx] = s2; 4281b2b2dd24SShri Abhyankar x[2+idx] = s3; 4282b2b2dd24SShri Abhyankar x[3+idx] = s4; 4283b2b2dd24SShri Abhyankar } 4284b2b2dd24SShri Abhyankar 4285b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4286b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4287b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4288b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4289b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4290b2b2dd24SShri Abhyankar idt = bs*i; 4291b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 
4292b2b2dd24SShri Abhyankar 4293b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4294b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4295b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4296b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4297b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4298b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4299b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4300b2b2dd24SShri Abhyankar 4301b2b2dd24SShri Abhyankar v += bs2; 4302b2b2dd24SShri Abhyankar } 4303b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4304b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4305b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4306b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4307b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4308b2b2dd24SShri Abhyankar 4309b2b2dd24SShri Abhyankar } 4310b2b2dd24SShri Abhyankar 43113649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 4312b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4313b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4314b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4315b2b2dd24SShri Abhyankar } 4316cee9d6f2SShri Abhyankar 4317cee9d6f2SShri Abhyankar #undef __FUNCT__ 4318f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4319dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4320f26ec98cSKris Buschelman { 4321f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4322b3260449SShri Abhyankar const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4323dfbe8321SBarry Smith PetscErrorCode ierr; 4324b3260449SShri Abhyankar const MatScalar *aa=a->a; 4325b3260449SShri Abhyankar const PetscScalar *b; 
4326b3260449SShri Abhyankar PetscScalar *x; 4327f26ec98cSKris Buschelman 4328f26ec98cSKris Buschelman PetscFunctionBegin; 43293649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 43301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4331f26ec98cSKris Buschelman 4332f26ec98cSKris Buschelman { 4333f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4334b3260449SShri Abhyankar const MatScalar *v; 4335b3260449SShri Abhyankar MatScalar *t=(MatScalar *)x; 4336b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i,ai16; 4337b3260449SShri Abhyankar const PetscInt *vi; 4338f26ec98cSKris Buschelman 4339f26ec98cSKris Buschelman /* forward solve the lower triangular */ 4340f26ec98cSKris Buschelman idx = 0; 4341f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 4342f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 4343f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 4344f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 4345f26ec98cSKris Buschelman for (i=1; i<n; i++) { 4346f26ec98cSKris Buschelman v = aa + 16*ai[i]; 4347f26ec98cSKris Buschelman vi = aj + ai[i]; 4348f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 4349f26ec98cSKris Buschelman idx += 4; 4350f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 4351f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 4352f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 4353f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 4354f26ec98cSKris Buschelman while (nz--) { 4355f26ec98cSKris Buschelman jdx = 4*(*vi++); 4356f26ec98cSKris Buschelman x1 = t[jdx]; 4357f26ec98cSKris Buschelman x2 = t[1+jdx]; 4358f26ec98cSKris Buschelman x3 = t[2+jdx]; 4359f26ec98cSKris Buschelman x4 = t[3+jdx]; 4360f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4361f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4362f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4363f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4364f26ec98cSKris Buschelman v += 16; 
4365f26ec98cSKris Buschelman } 4366f26ec98cSKris Buschelman t[idx] = s1; 4367f26ec98cSKris Buschelman t[1+idx] = s2; 4368f26ec98cSKris Buschelman t[2+idx] = s3; 4369f26ec98cSKris Buschelman t[3+idx] = s4; 4370f26ec98cSKris Buschelman } 4371f26ec98cSKris Buschelman /* backward solve the upper triangular */ 4372f26ec98cSKris Buschelman idt = 4*(n-1); 4373f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 4374f26ec98cSKris Buschelman ai16 = 16*diag[i]; 4375f26ec98cSKris Buschelman v = aa + ai16 + 16; 4376f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 4377f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 4378f26ec98cSKris Buschelman s1 = t[idt]; 4379f26ec98cSKris Buschelman s2 = t[1+idt]; 4380f26ec98cSKris Buschelman s3 = t[2+idt]; 4381f26ec98cSKris Buschelman s4 = t[3+idt]; 4382f26ec98cSKris Buschelman while (nz--) { 4383f26ec98cSKris Buschelman idx = 4*(*vi++); 4384f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 4385f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 4386f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 4387f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 4388f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4389f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4390f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4391f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4392f26ec98cSKris Buschelman v += 16; 4393f26ec98cSKris Buschelman } 4394f26ec98cSKris Buschelman v = aa + ai16; 4395f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4396f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4397f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4398f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4399f26ec98cSKris Buschelman idt -= 4; 4400f26ec98cSKris Buschelman } 4401f26ec98cSKris Buschelman } 
4402f26ec98cSKris Buschelman 44033649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 44041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4405dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4406f26ec98cSKris Buschelman PetscFunctionReturn(0); 4407f26ec98cSKris Buschelman } 4408f26ec98cSKris Buschelman 44093660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 44103660e330SKris Buschelman 44113660e330SKris Buschelman #include PETSC_HAVE_SSE 44123660e330SKris Buschelman #undef __FUNCT__ 44137cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4414dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 44153660e330SKris Buschelman { 44163660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 44172aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 4418dfbe8321SBarry Smith PetscErrorCode ierr; 4419dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 44203660e330SKris Buschelman MatScalar *aa=a->a; 442187828ca2SBarry Smith PetscScalar *x,*b; 44223660e330SKris Buschelman 44233660e330SKris Buschelman PetscFunctionBegin; 44243660e330SKris Buschelman SSE_SCOPE_BEGIN; 44253660e330SKris Buschelman /* 44263660e330SKris Buschelman Note: This code currently uses demotion of double 44273660e330SKris Buschelman to float when performing the mixed-mode computation. 44283660e330SKris Buschelman This may not be numerically reasonable for all applications. 
44293660e330SKris Buschelman */ 44303660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 44313660e330SKris Buschelman 44321ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 44331ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 44343660e330SKris Buschelman { 4435eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 4436eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 44372aa5897fSKris Buschelman int nz,i,idt,ai16; 44382aa5897fSKris Buschelman unsigned int jdx,idx; 44392aa5897fSKris Buschelman unsigned short *vi; 4440eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 44413660e330SKris Buschelman 4442eb05f457SKris Buschelman /* First block is the identity. */ 44433660e330SKris Buschelman idx = 0; 4444eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 44452aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 44463660e330SKris Buschelman 44473660e330SKris Buschelman for (i=1; i<n;) { 44483660e330SKris Buschelman PREFETCH_NTA(&v[8]); 44493660e330SKris Buschelman vi = aj + ai[i]; 44503660e330SKris Buschelman nz = diag[i] - ai[i]; 44513660e330SKris Buschelman idx += 4; 44523660e330SKris Buschelman 4453eb05f457SKris Buschelman /* Demote RHS from double to float. 
*/ 4454eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4455eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 44563660e330SKris Buschelman 44573660e330SKris Buschelman while (nz--) { 44583660e330SKris Buschelman PREFETCH_NTA(&v[16]); 44592aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 44603660e330SKris Buschelman 44613660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 4462eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 44633660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 44643660e330SKris Buschelman 44653660e330SKris Buschelman /* First Column */ 44663660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 44673660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 44683660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 44693660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 44703660e330SKris Buschelman 44713660e330SKris Buschelman /* Second Column */ 44723660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 44733660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 44743660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 44753660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 44763660e330SKris Buschelman 44773660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 44783660e330SKris Buschelman 44793660e330SKris Buschelman /* Third Column */ 44803660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 44813660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 44823660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 44833660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 44843660e330SKris Buschelman 44853660e330SKris Buschelman /* Fourth Column */ 44863660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 44873660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 44883660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 44893660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 44903660e330SKris Buschelman SSE_INLINE_END_2 44913660e330SKris Buschelman 44923660e330SKris Buschelman v += 16; 44933660e330SKris 
Buschelman } 44943660e330SKris Buschelman v = aa + 16*ai[++i]; 44953660e330SKris Buschelman PREFETCH_NTA(v); 4496eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 44973660e330SKris Buschelman } 4498eb05f457SKris Buschelman 4499eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 4500eb05f457SKris Buschelman 45013660e330SKris Buschelman idt = 4*(n-1); 45023660e330SKris Buschelman ai16 = 16*diag[n-1]; 45033660e330SKris Buschelman v = aa + ai16 + 16; 45043660e330SKris Buschelman for (i=n-1; i>=0;){ 45053660e330SKris Buschelman PREFETCH_NTA(&v[8]); 45063660e330SKris Buschelman vi = aj + diag[i] + 1; 45073660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 45083660e330SKris Buschelman 4509eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 45103660e330SKris Buschelman 45113660e330SKris Buschelman while (nz--) { 45123660e330SKris Buschelman PREFETCH_NTA(&v[16]); 45132aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 45143660e330SKris Buschelman 45153660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 4516eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 45173660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 45183660e330SKris Buschelman 45193660e330SKris Buschelman /* First Column */ 45203660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 45213660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45223660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 45233660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 45243660e330SKris Buschelman 45253660e330SKris Buschelman /* Second Column */ 45263660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 45273660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45283660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 45293660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 45303660e330SKris Buschelman 45313660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 45323660e330SKris Buschelman 45333660e330SKris Buschelman /* Third Column */ 45343660e330SKris Buschelman 
SSE_COPY_PS(XMM2,XMM6) 45353660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45363660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 45373660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 45383660e330SKris Buschelman 45393660e330SKris Buschelman /* Fourth Column */ 45403660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 45413660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45423660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 45433660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 45443660e330SKris Buschelman SSE_INLINE_END_2 45453660e330SKris Buschelman v += 16; 45463660e330SKris Buschelman } 45473660e330SKris Buschelman v = aa + ai16; 45483660e330SKris Buschelman ai16 = 16*diag[--i]; 45493660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 45503660e330SKris Buschelman /* 45513660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 45523660e330SKris Buschelman which was inverted as part of the factorization 45533660e330SKris Buschelman */ 4554eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 45553660e330SKris Buschelman /* First Column */ 45563660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 45573660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 45583660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 45593660e330SKris Buschelman 45603660e330SKris Buschelman /* Second Column */ 45613660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 45623660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 45633660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 45643660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 45653660e330SKris Buschelman 45663660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 45673660e330SKris Buschelman 45683660e330SKris Buschelman /* Third Column */ 45693660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 45703660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 45713660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 45723660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 45733660e330SKris 
Buschelman 45743660e330SKris Buschelman /* Fourth Column */ 45753660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 45763660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 45773660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 45783660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 45793660e330SKris Buschelman 45803660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 45813660e330SKris Buschelman SSE_INLINE_END_3 45823660e330SKris Buschelman 45833660e330SKris Buschelman v = aa + ai16 + 16; 45843660e330SKris Buschelman idt -= 4; 45853660e330SKris Buschelman } 4586eb05f457SKris Buschelman 4587eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 4588eb05f457SKris Buschelman idt = 4*(n-1); 4589eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 4590eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4591eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4592eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 4593eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 4594eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 4595eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 4596eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 4597eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 459854693613SKris Buschelman idt -= 4; 45993660e330SKris Buschelman } 4600eb05f457SKris Buschelman 4601eb05f457SKris Buschelman } /* End of artificial scope. 
*/ 46021ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 46031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4604dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 46053660e330SKris Buschelman SSE_SCOPE_END; 46063660e330SKris Buschelman PetscFunctionReturn(0); 46073660e330SKris Buschelman } 46083660e330SKris Buschelman 46097cf1b8d3SKris Buschelman #undef __FUNCT__ 46107cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4611dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 46127cf1b8d3SKris Buschelman { 46137cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 46147cf1b8d3SKris Buschelman int *aj=a->j; 4615dfbe8321SBarry Smith PetscErrorCode ierr; 4616dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 46177cf1b8d3SKris Buschelman MatScalar *aa=a->a; 46187cf1b8d3SKris Buschelman PetscScalar *x,*b; 46197cf1b8d3SKris Buschelman 46207cf1b8d3SKris Buschelman PetscFunctionBegin; 46217cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 46227cf1b8d3SKris Buschelman /* 46237cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 46247cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 46257cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 
46267cf1b8d3SKris Buschelman */ 46277cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 46287cf1b8d3SKris Buschelman 46291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 46301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 46317cf1b8d3SKris Buschelman { 46327cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 46337cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 46347cf1b8d3SKris Buschelman int nz,i,idt,ai16; 46357cf1b8d3SKris Buschelman int jdx,idx; 46367cf1b8d3SKris Buschelman int *vi; 46377cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 46387cf1b8d3SKris Buschelman 46397cf1b8d3SKris Buschelman /* First block is the identity. */ 46407cf1b8d3SKris Buschelman idx = 0; 46417cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 46427cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 46437cf1b8d3SKris Buschelman 46447cf1b8d3SKris Buschelman for (i=1; i<n;) { 46457cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 46467cf1b8d3SKris Buschelman vi = aj + ai[i]; 46477cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 46487cf1b8d3SKris Buschelman idx += 4; 46497cf1b8d3SKris Buschelman 46507cf1b8d3SKris Buschelman /* Demote RHS from double to float. 
*/ 46517cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 46527cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 46537cf1b8d3SKris Buschelman 46547cf1b8d3SKris Buschelman while (nz--) { 46557cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 46567cf1b8d3SKris Buschelman jdx = 4*(*vi++); 46577cf1b8d3SKris Buschelman /* jdx = *vi++; */ 46587cf1b8d3SKris Buschelman 46597cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 46607cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 46617cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 46627cf1b8d3SKris Buschelman 46637cf1b8d3SKris Buschelman /* First Column */ 46647cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 46657cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 46667cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 46677cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 46687cf1b8d3SKris Buschelman 46697cf1b8d3SKris Buschelman /* Second Column */ 46707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 46717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 46727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 46737cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 46747cf1b8d3SKris Buschelman 46757cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 46767cf1b8d3SKris Buschelman 46777cf1b8d3SKris Buschelman /* Third Column */ 46787cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 46797cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 46807cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 46817cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 46827cf1b8d3SKris Buschelman 46837cf1b8d3SKris Buschelman /* Fourth Column */ 46847cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 46857cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 46867cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 46877cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 46887cf1b8d3SKris Buschelman SSE_INLINE_END_2 46897cf1b8d3SKris Buschelman 46907cf1b8d3SKris 
Buschelman v += 16; 46917cf1b8d3SKris Buschelman } 46927cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 46937cf1b8d3SKris Buschelman PREFETCH_NTA(v); 46947cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 46957cf1b8d3SKris Buschelman } 46967cf1b8d3SKris Buschelman 46977cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 46987cf1b8d3SKris Buschelman 46997cf1b8d3SKris Buschelman idt = 4*(n-1); 47007cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 47017cf1b8d3SKris Buschelman v = aa + ai16 + 16; 47027cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 47037cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 47047cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 47057cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 47067cf1b8d3SKris Buschelman 47077cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 47087cf1b8d3SKris Buschelman 47097cf1b8d3SKris Buschelman while (nz--) { 47107cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 47117cf1b8d3SKris Buschelman idx = 4*(*vi++); 47127cf1b8d3SKris Buschelman /* idx = *vi++; */ 47137cf1b8d3SKris Buschelman 47147cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 47157cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 47167cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 47177cf1b8d3SKris Buschelman 47187cf1b8d3SKris Buschelman /* First Column */ 47197cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 47207cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 47217cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 47227cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 47237cf1b8d3SKris Buschelman 47247cf1b8d3SKris Buschelman /* Second Column */ 47257cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 47267cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 47277cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 47287cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 47297cf1b8d3SKris Buschelman 47307cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 47317cf1b8d3SKris Buschelman 
47327cf1b8d3SKris Buschelman /* Third Column */ 47337cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 47347cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 47357cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 47367cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 47377cf1b8d3SKris Buschelman 47387cf1b8d3SKris Buschelman /* Fourth Column */ 47397cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 47407cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 47417cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 47427cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 47437cf1b8d3SKris Buschelman SSE_INLINE_END_2 47447cf1b8d3SKris Buschelman v += 16; 47457cf1b8d3SKris Buschelman } 47467cf1b8d3SKris Buschelman v = aa + ai16; 47477cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 47487cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 47497cf1b8d3SKris Buschelman /* 47507cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 47517cf1b8d3SKris Buschelman which was inverted as part of the factorization 47527cf1b8d3SKris Buschelman */ 47537cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 47547cf1b8d3SKris Buschelman /* First Column */ 47557cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 47567cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 47577cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 47587cf1b8d3SKris Buschelman 47597cf1b8d3SKris Buschelman /* Second Column */ 47607cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 47617cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 47627cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 47637cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 47647cf1b8d3SKris Buschelman 47657cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 47667cf1b8d3SKris Buschelman 47677cf1b8d3SKris Buschelman /* Third Column */ 47687cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 47697cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 47707cf1b8d3SKris Buschelman 
SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 47717cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 47727cf1b8d3SKris Buschelman 47737cf1b8d3SKris Buschelman /* Fourth Column */ 47747cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 47757cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 47767cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 47777cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 47787cf1b8d3SKris Buschelman 47797cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 47807cf1b8d3SKris Buschelman SSE_INLINE_END_3 47817cf1b8d3SKris Buschelman 47827cf1b8d3SKris Buschelman v = aa + ai16 + 16; 47837cf1b8d3SKris Buschelman idt -= 4; 47847cf1b8d3SKris Buschelman } 47857cf1b8d3SKris Buschelman 47867cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 47877cf1b8d3SKris Buschelman idt = 4*(n-1); 47887cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 47897cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 47907cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 47917cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 47927cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 47937cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 47947cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 47957cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 47967cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 47977cf1b8d3SKris Buschelman idt -= 4; 47987cf1b8d3SKris Buschelman } 47997cf1b8d3SKris Buschelman 48007cf1b8d3SKris Buschelman } /* End of artificial scope. 
*/ 48011ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 48021ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4803dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 48047cf1b8d3SKris Buschelman SSE_SCOPE_END; 48057cf1b8d3SKris Buschelman PetscFunctionReturn(0); 48067cf1b8d3SKris Buschelman } 48077cf1b8d3SKris Buschelman 48083660e330SKris Buschelman #endif 48098f690400SShri Abhyankar 48104a2ae208SSatish Balay #undef __FUNCT__ 481106e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 481206e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 48134e2b4712SSatish Balay { 48144e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48154e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 48166849ba73SBarry Smith PetscErrorCode ierr; 4817b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4818b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 48195d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4820d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4821d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4822d9fead3dSBarry Smith const PetscScalar *b; 48234e2b4712SSatish Balay 48244e2b4712SSatish Balay PetscFunctionBegin; 48253649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 48261ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4827f1af5d2fSBarry Smith t = a->solve_work; 48284e2b4712SSatish Balay 48294e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 48304e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 48314e2b4712SSatish Balay 48324e2b4712SSatish Balay /* forward solve the lower triangular */ 48334e2b4712SSatish Balay idx = 3*(*r++); 4834f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 48354e2b4712SSatish Balay for (i=1; i<n; i++) { 48364e2b4712SSatish Balay v = aa + 9*ai[i]; 48374e2b4712SSatish 
Balay vi = aj + ai[i]; 48384e2b4712SSatish Balay nz = diag[i] - ai[i]; 48394e2b4712SSatish Balay idx = 3*(*r++); 4840f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 48414e2b4712SSatish Balay while (nz--) { 48424e2b4712SSatish Balay idx = 3*(*vi++); 4843f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4844f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4845f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4846f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48474e2b4712SSatish Balay v += 9; 48484e2b4712SSatish Balay } 48494e2b4712SSatish Balay idx = 3*i; 4850f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 48514e2b4712SSatish Balay } 48524e2b4712SSatish Balay /* backward solve the upper triangular */ 48534e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 48544e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 48554e2b4712SSatish Balay vi = aj + diag[i] + 1; 48564e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 48574e2b4712SSatish Balay idt = 3*i; 4858f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 48594e2b4712SSatish Balay while (nz--) { 48604e2b4712SSatish Balay idx = 3*(*vi++); 4861f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4862f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4863f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4864f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 48654e2b4712SSatish Balay v += 9; 48664e2b4712SSatish Balay } 48674e2b4712SSatish Balay idc = 3*(*c--); 48684e2b4712SSatish Balay v = aa + 9*diag[i]; 4869f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4870f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4871f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 48724e2b4712SSatish Balay } 48734e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 48744e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 48753649974fSBarry Smith 
ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 48761ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4877dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 48784e2b4712SSatish Balay PetscFunctionReturn(0); 48794e2b4712SSatish Balay } 48804e2b4712SSatish Balay 48810c4413a7SShri Abhyankar #undef __FUNCT__ 48824dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3" 48834dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 48840c4413a7SShri Abhyankar { 48850c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 48860c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 48870c4413a7SShri Abhyankar PetscErrorCode ierr; 4888b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4889b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc,m; 48900c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 48910c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 48920c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 48930c4413a7SShri Abhyankar const PetscScalar *b; 48940c4413a7SShri Abhyankar 48950c4413a7SShri Abhyankar PetscFunctionBegin; 48963649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 48970c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 48980c4413a7SShri Abhyankar t = a->solve_work; 48990c4413a7SShri Abhyankar 49000c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 49010c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 49020c4413a7SShri Abhyankar 49030c4413a7SShri Abhyankar /* forward solve the lower triangular */ 49040c4413a7SShri Abhyankar idx = 3*r[0]; 49050c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 49060c4413a7SShri Abhyankar for (i=1; i<n; i++) { 49070c4413a7SShri Abhyankar v = aa + 9*ai[i]; 49080c4413a7SShri Abhyankar vi = aj + ai[i]; 49090c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 49100c4413a7SShri Abhyankar idx = 
3*r[i]; 49110c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 49120c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49130c4413a7SShri Abhyankar idx = 3*vi[m]; 49140c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 49150c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 49160c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 49170c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 49180c4413a7SShri Abhyankar v += 9; 49190c4413a7SShri Abhyankar } 49200c4413a7SShri Abhyankar idx = 3*i; 49210c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 49220c4413a7SShri Abhyankar } 49230c4413a7SShri Abhyankar /* backward solve the upper triangular */ 49240c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 49250c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 49260c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 49270c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 49280c4413a7SShri Abhyankar idt = 3*i; 49290c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 49300c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 49310c4413a7SShri Abhyankar idx = 3*vi[m]; 49320c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 49330c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 49340c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 49350c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 49360c4413a7SShri Abhyankar v += 9; 49370c4413a7SShri Abhyankar } 49380c4413a7SShri Abhyankar idc = 3*c[i]; 49390c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 49400c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 49410c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 49420c4413a7SShri Abhyankar } 49430c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 49440c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 49453649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 
49460c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 49470c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 49480c4413a7SShri Abhyankar PetscFunctionReturn(0); 49490c4413a7SShri Abhyankar } 49500c4413a7SShri Abhyankar 495115091d37SBarry Smith /* 495215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 495315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 495415091d37SBarry Smith */ 49554a2ae208SSatish Balay #undef __FUNCT__ 495606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 495706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 495815091d37SBarry Smith { 495915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 49600b68f018SBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4961dfbe8321SBarry Smith PetscErrorCode ierr; 49620b68f018SBarry Smith const PetscInt *diag = a->diag,*vi; 4963d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4964d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4965d9fead3dSBarry Smith const PetscScalar *b; 49660b68f018SBarry Smith PetscInt jdx,idt,idx,nz,i; 496715091d37SBarry Smith 496815091d37SBarry Smith PetscFunctionBegin; 49693649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 49701ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 497115091d37SBarry Smith 497215091d37SBarry Smith /* forward solve the lower triangular */ 497315091d37SBarry Smith idx = 0; 497415091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 497515091d37SBarry Smith for (i=1; i<n; i++) { 497615091d37SBarry Smith v = aa + 9*ai[i]; 497715091d37SBarry Smith vi = aj + ai[i]; 497815091d37SBarry Smith nz = diag[i] - ai[i]; 497915091d37SBarry Smith idx += 3; 4980f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 498115091d37SBarry Smith while (nz--) { 498215091d37SBarry Smith jdx = 3*(*vi++); 
498315091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4984f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4985f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4986f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 498715091d37SBarry Smith v += 9; 498815091d37SBarry Smith } 4989f1af5d2fSBarry Smith x[idx] = s1; 4990f1af5d2fSBarry Smith x[1+idx] = s2; 4991f1af5d2fSBarry Smith x[2+idx] = s3; 499215091d37SBarry Smith } 499315091d37SBarry Smith /* backward solve the upper triangular */ 499415091d37SBarry Smith for (i=n-1; i>=0; i--){ 499515091d37SBarry Smith v = aa + 9*diag[i] + 9; 499615091d37SBarry Smith vi = aj + diag[i] + 1; 499715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 499815091d37SBarry Smith idt = 3*i; 4999f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 5000f1af5d2fSBarry Smith s3 = x[2+idt]; 500115091d37SBarry Smith while (nz--) { 500215091d37SBarry Smith idx = 3*(*vi++); 500315091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 5004f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5005f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5006f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 500715091d37SBarry Smith v += 9; 500815091d37SBarry Smith } 500915091d37SBarry Smith v = aa + 9*diag[i]; 5010f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5011f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5012f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 501315091d37SBarry Smith } 501415091d37SBarry Smith 50153649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 50161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5017dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 501815091d37SBarry Smith PetscFunctionReturn(0); 501915091d37SBarry Smith } 502015091d37SBarry Smith 5021cee9d6f2SShri Abhyankar #undef __FUNCT__ 50224dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 
50234dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 5024b2b2dd24SShri Abhyankar { 5025b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5026b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5027b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5028b3260449SShri Abhyankar PetscInt i,k,nz,idx,jdx,idt; 5029b3260449SShri Abhyankar const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 5030b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5031b2b2dd24SShri Abhyankar PetscScalar *x; 5032b2b2dd24SShri Abhyankar const PetscScalar *b; 5033b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 5034b2b2dd24SShri Abhyankar 5035b2b2dd24SShri Abhyankar PetscFunctionBegin; 50363649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5037b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5038b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5039b2b2dd24SShri Abhyankar idx = 0; 5040b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 5041b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5042b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 5043b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5044b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5045b2b2dd24SShri Abhyankar idx = bs*i; 5046b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 5047b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5048b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 5049b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 5050b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5051b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5052b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5053b2b2dd24SShri Abhyankar 5054b2b2dd24SShri Abhyankar v += bs2; 5055b2b2dd24SShri Abhyankar } 5056b2b2dd24SShri Abhyankar 5057b2b2dd24SShri Abhyankar x[idx] = s1; 5058b2b2dd24SShri Abhyankar x[1+idx] = s2; 5059b2b2dd24SShri Abhyankar x[2+idx] = s3; 5060b2b2dd24SShri 
Abhyankar } 5061b2b2dd24SShri Abhyankar 5062b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5063b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 5064b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 5065b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5066b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5067b2b2dd24SShri Abhyankar idt = bs*i; 5068b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 5069b2b2dd24SShri Abhyankar 5070b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5071b2b2dd24SShri Abhyankar idx = bs*vi[k]; 5072b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 5073b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 5074b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 5075b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 5076b2b2dd24SShri Abhyankar 5077b2b2dd24SShri Abhyankar v += bs2; 5078b2b2dd24SShri Abhyankar } 5079b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5080b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 5081b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 5082b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 5083b2b2dd24SShri Abhyankar 5084b2b2dd24SShri Abhyankar } 5085b2b2dd24SShri Abhyankar 50863649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5087b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5088b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 5089b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5090b2b2dd24SShri Abhyankar } 5091b2b2dd24SShri Abhyankar 5092b2b2dd24SShri Abhyankar #undef __FUNCT__ 509306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 509406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 50954e2b4712SSatish Balay { 50964e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 50974e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 50986849ba73SBarry Smith 
PetscErrorCode ierr; 5099b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5100b3260449SShri Abhyankar PetscInt i,nz,idx,idt,idc; 51015d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5102d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5103d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 5104d9fead3dSBarry Smith const PetscScalar *b; 51054e2b4712SSatish Balay 51064e2b4712SSatish Balay PetscFunctionBegin; 51073649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51081ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5109f1af5d2fSBarry Smith t = a->solve_work; 51104e2b4712SSatish Balay 51114e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51124e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 51134e2b4712SSatish Balay 51144e2b4712SSatish Balay /* forward solve the lower triangular */ 51154e2b4712SSatish Balay idx = 2*(*r++); 5116f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 51174e2b4712SSatish Balay for (i=1; i<n; i++) { 51184e2b4712SSatish Balay v = aa + 4*ai[i]; 51194e2b4712SSatish Balay vi = aj + ai[i]; 51204e2b4712SSatish Balay nz = diag[i] - ai[i]; 51214e2b4712SSatish Balay idx = 2*(*r++); 5122f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 51234e2b4712SSatish Balay while (nz--) { 51244e2b4712SSatish Balay idx = 2*(*vi++); 5125f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5126f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5127f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51284e2b4712SSatish Balay v += 4; 51294e2b4712SSatish Balay } 51304e2b4712SSatish Balay idx = 2*i; 5131f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 51324e2b4712SSatish Balay } 51334e2b4712SSatish Balay /* backward solve the upper triangular */ 51344e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 51354e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 51364e2b4712SSatish Balay vi = aj + diag[i] + 1; 51374e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 
51384e2b4712SSatish Balay idt = 2*i; 5139f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 51404e2b4712SSatish Balay while (nz--) { 51414e2b4712SSatish Balay idx = 2*(*vi++); 5142f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 5143f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5144f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 51454e2b4712SSatish Balay v += 4; 51464e2b4712SSatish Balay } 51474e2b4712SSatish Balay idc = 2*(*c--); 51484e2b4712SSatish Balay v = aa + 4*diag[i]; 5149f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5150f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 51514e2b4712SSatish Balay } 51524e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 51534e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 51543649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 51551ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5156dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 51574e2b4712SSatish Balay PetscFunctionReturn(0); 51584e2b4712SSatish Balay } 51594e2b4712SSatish Balay 51600c4413a7SShri Abhyankar #undef __FUNCT__ 51614dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2" 51624dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 51630c4413a7SShri Abhyankar { 51640c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 51650c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 51660c4413a7SShri Abhyankar PetscErrorCode ierr; 5167b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5168b3260449SShri Abhyankar PetscInt i,nz,idx,jdx,idt,idc,m; 51690c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 51700c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 51710c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 51720c4413a7SShri Abhyankar const PetscScalar *b; 51730c4413a7SShri Abhyankar 51740c4413a7SShri Abhyankar PetscFunctionBegin; 
51753649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 51760c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 51770c4413a7SShri Abhyankar t = a->solve_work; 51780c4413a7SShri Abhyankar 51790c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 51800c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 51810c4413a7SShri Abhyankar 51820c4413a7SShri Abhyankar /* forward solve the lower triangular */ 51830c4413a7SShri Abhyankar idx = 2*r[0]; 51840c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 51850c4413a7SShri Abhyankar for (i=1; i<n; i++) { 51860c4413a7SShri Abhyankar v = aa + 4*ai[i]; 51870c4413a7SShri Abhyankar vi = aj + ai[i]; 51880c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 51890c4413a7SShri Abhyankar idx = 2*r[i]; 51900c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 51910c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 51920c4413a7SShri Abhyankar jdx = 2*vi[m]; 51930c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 51940c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 51950c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 51960c4413a7SShri Abhyankar v += 4; 51970c4413a7SShri Abhyankar } 51980c4413a7SShri Abhyankar idx = 2*i; 51990c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 52000c4413a7SShri Abhyankar } 52010c4413a7SShri Abhyankar /* backward solve the upper triangular */ 52020c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 52030c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 52040c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 52050c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 52060c4413a7SShri Abhyankar idt = 2*i; 52070c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 52080c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 52090c4413a7SShri Abhyankar idx = 2*vi[m]; 52100c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 52110c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 52120c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 52130c4413a7SShri Abhyankar v += 4; 
52140c4413a7SShri Abhyankar } 52150c4413a7SShri Abhyankar idc = 2*c[i]; 52160c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 52170c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 52180c4413a7SShri Abhyankar } 52190c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 52200c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 52213649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52220c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52230c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 52240c4413a7SShri Abhyankar PetscFunctionReturn(0); 52250c4413a7SShri Abhyankar } 52268f690400SShri Abhyankar 522715091d37SBarry Smith /* 522815091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 522915091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 523015091d37SBarry Smith */ 52314a2ae208SSatish Balay #undef __FUNCT__ 523206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 523306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 523415091d37SBarry Smith { 523515091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5236b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5237dfbe8321SBarry Smith PetscErrorCode ierr; 5238d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 5239d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 5240d9fead3dSBarry Smith const PetscScalar *b; 5241b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 524215091d37SBarry Smith 524315091d37SBarry Smith PetscFunctionBegin; 52443649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 52451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524615091d37SBarry Smith 524715091d37SBarry Smith /* forward solve the lower triangular */ 524815091d37SBarry Smith idx = 0; 
524915091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 525015091d37SBarry Smith for (i=1; i<n; i++) { 525115091d37SBarry Smith v = aa + 4*ai[i]; 525215091d37SBarry Smith vi = aj + ai[i]; 525315091d37SBarry Smith nz = diag[i] - ai[i]; 525415091d37SBarry Smith idx += 2; 5255f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 525615091d37SBarry Smith while (nz--) { 525715091d37SBarry Smith jdx = 2*(*vi++); 525815091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 5259f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5260f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 526115091d37SBarry Smith v += 4; 526215091d37SBarry Smith } 5263f1af5d2fSBarry Smith x[idx] = s1; 5264f1af5d2fSBarry Smith x[1+idx] = s2; 526515091d37SBarry Smith } 526615091d37SBarry Smith /* backward solve the upper triangular */ 526715091d37SBarry Smith for (i=n-1; i>=0; i--){ 526815091d37SBarry Smith v = aa + 4*diag[i] + 4; 526915091d37SBarry Smith vi = aj + diag[i] + 1; 527015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 527115091d37SBarry Smith idt = 2*i; 5272f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 527315091d37SBarry Smith while (nz--) { 527415091d37SBarry Smith idx = 2*(*vi++); 527515091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 5276f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 5277f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 527815091d37SBarry Smith v += 4; 527915091d37SBarry Smith } 528015091d37SBarry Smith v = aa + 4*diag[i]; 5281f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 5282f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 528315091d37SBarry Smith } 528415091d37SBarry Smith 52853649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 52861ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5287dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 528815091d37SBarry Smith PetscFunctionReturn(0); 528915091d37SBarry Smith } 529015091d37SBarry Smith 5291cee9d6f2SShri Abhyankar #undef __FUNCT__ 52924dd39f65SShri Abhyankar #define __FUNCT__ 
"MatSolve_SeqBAIJ_2_NaturalOrdering" 52934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5294b2b2dd24SShri Abhyankar { 5295b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5296b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5297b3260449SShri Abhyankar PetscInt i,k,nz,idx,idt,jdx; 5298b2b2dd24SShri Abhyankar PetscErrorCode ierr; 5299b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 5300b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 5301b2b2dd24SShri Abhyankar const PetscScalar *b; 5302b2b2dd24SShri Abhyankar 5303b2b2dd24SShri Abhyankar PetscFunctionBegin; 53043649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5305b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5306b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 5307b2b2dd24SShri Abhyankar idx = 0; 5308b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 5309b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 5310b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 5311b2b2dd24SShri Abhyankar vi = aj + ai[i]; 5312b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 5313b2b2dd24SShri Abhyankar idx = 2*i; 5314b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 5315*4c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5316*4c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5317b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5318b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 5319b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 5320b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5321b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5322b2b2dd24SShri Abhyankar v += 4; 5323b2b2dd24SShri Abhyankar } 5324b2b2dd24SShri Abhyankar x[idx] = s1; 5325b2b2dd24SShri Abhyankar x[1+idx] = s2; 5326b2b2dd24SShri Abhyankar } 5327b2b2dd24SShri Abhyankar 5328b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 5329b2b2dd24SShri Abhyankar for (i=n-1; i>=0; 
i--){ 5330b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 5331b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 5332b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5333b2b2dd24SShri Abhyankar idt = 2*i; 5334b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 5335*4c0dbd8dSJed Brown PetscPrefetchBlock(vi+nz,nz,0,PETSC_PREFETCH_HINT_NTA); 5336*4c0dbd8dSJed Brown PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); 5337b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 5338b2b2dd24SShri Abhyankar idx = 2*vi[k]; 5339b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 5340b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 5341b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 5342b2b2dd24SShri Abhyankar v += 4; 5343b2b2dd24SShri Abhyankar } 5344b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 5345b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 5346b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 5347b2b2dd24SShri Abhyankar } 5348b2b2dd24SShri Abhyankar 53493649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5350b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5351b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5352b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 5353b2b2dd24SShri Abhyankar } 5354b2b2dd24SShri Abhyankar 5355b2b2dd24SShri Abhyankar #undef __FUNCT__ 535606e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 535706e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 53584e2b4712SSatish Balay { 53594e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 53604e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 53616849ba73SBarry Smith PetscErrorCode ierr; 5362b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5363b3260449SShri Abhyankar PetscInt i,nz; 53645d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5365b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5366b3260449SShri 
Abhyankar PetscScalar *x,s1,*t; 5367b3260449SShri Abhyankar const PetscScalar *b; 53684e2b4712SSatish Balay 53694e2b4712SSatish Balay PetscFunctionBegin; 53704e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 53714e2b4712SSatish Balay 53723649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 53731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5374f1af5d2fSBarry Smith t = a->solve_work; 53754e2b4712SSatish Balay 53764e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 53774e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 53784e2b4712SSatish Balay 53794e2b4712SSatish Balay /* forward solve the lower triangular */ 5380f1af5d2fSBarry Smith t[0] = b[*r++]; 53814e2b4712SSatish Balay for (i=1; i<n; i++) { 53824e2b4712SSatish Balay v = aa + ai[i]; 53834e2b4712SSatish Balay vi = aj + ai[i]; 53844e2b4712SSatish Balay nz = diag[i] - ai[i]; 5385f1af5d2fSBarry Smith s1 = b[*r++]; 53864e2b4712SSatish Balay while (nz--) { 5387f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53884e2b4712SSatish Balay } 5389f1af5d2fSBarry Smith t[i] = s1; 53904e2b4712SSatish Balay } 53914e2b4712SSatish Balay /* backward solve the upper triangular */ 53924e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 53934e2b4712SSatish Balay v = aa + diag[i] + 1; 53944e2b4712SSatish Balay vi = aj + diag[i] + 1; 53954e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 5396f1af5d2fSBarry Smith s1 = t[i]; 53974e2b4712SSatish Balay while (nz--) { 5398f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 53994e2b4712SSatish Balay } 5400f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 54014e2b4712SSatish Balay } 54024e2b4712SSatish Balay 54034e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 54044e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 54053649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 54061ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 
5407dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 54084e2b4712SSatish Balay PetscFunctionReturn(0); 54094e2b4712SSatish Balay } 5410048b5e81SShri Abhyankar 5411048b5e81SShri Abhyankar #undef __FUNCT__ 5412048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1" 5413048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 5414048b5e81SShri Abhyankar { 5415048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5416048b5e81SShri Abhyankar IS iscol = a->col,isrow = a->row; 5417048b5e81SShri Abhyankar PetscErrorCode ierr; 5418048b5e81SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz; 5419048b5e81SShri Abhyankar const PetscInt *rout,*cout,*r,*c; 5420048b5e81SShri Abhyankar PetscScalar *x,*tmp,sum; 5421048b5e81SShri Abhyankar const PetscScalar *b; 5422048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5423048b5e81SShri Abhyankar 5424048b5e81SShri Abhyankar PetscFunctionBegin; 5425048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5426048b5e81SShri Abhyankar 54273649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5428048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5429048b5e81SShri Abhyankar tmp = a->solve_work; 5430048b5e81SShri Abhyankar 5431048b5e81SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5432048b5e81SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5433048b5e81SShri Abhyankar 5434048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5435048b5e81SShri Abhyankar tmp[0] = b[r[0]]; 5436048b5e81SShri Abhyankar v = aa; 5437048b5e81SShri Abhyankar vi = aj; 5438048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5439048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5440048b5e81SShri Abhyankar sum = b[r[i]]; 5441048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5442048b5e81SShri Abhyankar tmp[i] = sum; 5443048b5e81SShri Abhyankar v += nz; vi += nz; 
5444048b5e81SShri Abhyankar } 5445048b5e81SShri Abhyankar 5446048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5447048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5448048b5e81SShri Abhyankar v = aa + adiag[i+1]+1; 5449048b5e81SShri Abhyankar vi = aj + adiag[i+1]+1; 5450048b5e81SShri Abhyankar nz = adiag[i]-adiag[i+1]-1; 5451048b5e81SShri Abhyankar sum = tmp[i]; 5452048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,tmp,v,vi,nz); 5453048b5e81SShri Abhyankar x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */ 5454048b5e81SShri Abhyankar } 5455048b5e81SShri Abhyankar 5456048b5e81SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5457048b5e81SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 54583649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5459048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5460048b5e81SShri Abhyankar ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr); 5461048b5e81SShri Abhyankar PetscFunctionReturn(0); 5462048b5e81SShri Abhyankar } 5463048b5e81SShri Abhyankar 546415091d37SBarry Smith /* 546515091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 546615091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 
546715091d37SBarry Smith */ 54684a2ae208SSatish Balay #undef __FUNCT__ 546906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 547006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 547115091d37SBarry Smith { 547215091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5473b3260449SShri Abhyankar const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5474dfbe8321SBarry Smith PetscErrorCode ierr; 5475b3260449SShri Abhyankar const MatScalar *aa=a->a,*v; 5476b3260449SShri Abhyankar PetscScalar *x; 5477b3260449SShri Abhyankar const PetscScalar *b; 547887828ca2SBarry Smith PetscScalar s1,x1; 5479b3260449SShri Abhyankar PetscInt jdx,idt,idx,nz,i; 548015091d37SBarry Smith 548115091d37SBarry Smith PetscFunctionBegin; 54823649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 54831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 548415091d37SBarry Smith 548515091d37SBarry Smith /* forward solve the lower triangular */ 548615091d37SBarry Smith idx = 0; 548715091d37SBarry Smith x[0] = b[0]; 548815091d37SBarry Smith for (i=1; i<n; i++) { 548915091d37SBarry Smith v = aa + ai[i]; 549015091d37SBarry Smith vi = aj + ai[i]; 549115091d37SBarry Smith nz = diag[i] - ai[i]; 549215091d37SBarry Smith idx += 1; 5493f1af5d2fSBarry Smith s1 = b[idx]; 549415091d37SBarry Smith while (nz--) { 549515091d37SBarry Smith jdx = *vi++; 549615091d37SBarry Smith x1 = x[jdx]; 5497f1af5d2fSBarry Smith s1 -= v[0]*x1; 549815091d37SBarry Smith v += 1; 549915091d37SBarry Smith } 5500f1af5d2fSBarry Smith x[idx] = s1; 550115091d37SBarry Smith } 550215091d37SBarry Smith /* backward solve the upper triangular */ 550315091d37SBarry Smith for (i=n-1; i>=0; i--){ 550415091d37SBarry Smith v = aa + diag[i] + 1; 550515091d37SBarry Smith vi = aj + diag[i] + 1; 550615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 550715091d37SBarry Smith idt = i; 5508f1af5d2fSBarry Smith s1 = x[idt]; 550915091d37SBarry Smith 
while (nz--) { 551015091d37SBarry Smith idx = *vi++; 551115091d37SBarry Smith x1 = x[idx]; 5512f1af5d2fSBarry Smith s1 -= v[0]*x1; 551315091d37SBarry Smith v += 1; 551415091d37SBarry Smith } 551515091d37SBarry Smith v = aa + diag[i]; 5516f1af5d2fSBarry Smith x[idt] = v[0]*s1; 551715091d37SBarry Smith } 55183649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 55191ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5520dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 552115091d37SBarry Smith PetscFunctionReturn(0); 552215091d37SBarry Smith } 55234e2b4712SSatish Balay 5524048b5e81SShri Abhyankar 5525048b5e81SShri Abhyankar #undef __FUNCT__ 5526048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5527048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5528048b5e81SShri Abhyankar { 5529048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 5530048b5e81SShri Abhyankar PetscErrorCode ierr; 5531048b5e81SShri Abhyankar const PetscInt n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi; 5532048b5e81SShri Abhyankar PetscScalar *x,sum; 5533048b5e81SShri Abhyankar const PetscScalar *b; 5534048b5e81SShri Abhyankar const MatScalar *aa = a->a,*v; 5535048b5e81SShri Abhyankar PetscInt i,nz; 5536048b5e81SShri Abhyankar 5537048b5e81SShri Abhyankar PetscFunctionBegin; 5538048b5e81SShri Abhyankar if (!n) PetscFunctionReturn(0); 5539048b5e81SShri Abhyankar 55403649974fSBarry Smith ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr); 5541048b5e81SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5542048b5e81SShri Abhyankar 5543048b5e81SShri Abhyankar /* forward solve the lower triangular */ 5544048b5e81SShri Abhyankar x[0] = b[0]; 5545048b5e81SShri Abhyankar v = aa; 5546048b5e81SShri Abhyankar vi = aj; 5547048b5e81SShri Abhyankar for (i=1; i<n; i++) { 5548048b5e81SShri Abhyankar nz = ai[i+1] - ai[i]; 5549048b5e81SShri Abhyankar sum = b[i]; 
5550048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5551048b5e81SShri Abhyankar v += nz; 5552048b5e81SShri Abhyankar vi += nz; 5553048b5e81SShri Abhyankar x[i] = sum; 5554048b5e81SShri Abhyankar } 5555048b5e81SShri Abhyankar 5556048b5e81SShri Abhyankar /* backward solve the upper triangular */ 5557048b5e81SShri Abhyankar for (i=n-1; i>=0; i--){ 5558048b5e81SShri Abhyankar v = aa + adiag[i+1] + 1; 5559048b5e81SShri Abhyankar vi = aj + adiag[i+1] + 1; 5560048b5e81SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 5561048b5e81SShri Abhyankar sum = x[i]; 5562048b5e81SShri Abhyankar PetscSparseDenseMinusDot(sum,x,v,vi,nz); 5563048b5e81SShri Abhyankar x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */ 5564048b5e81SShri Abhyankar } 5565048b5e81SShri Abhyankar 5566048b5e81SShri Abhyankar ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr); 55673649974fSBarry Smith ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr); 5568048b5e81SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5569048b5e81SShri Abhyankar PetscFunctionReturn(0); 5570048b5e81SShri Abhyankar } 5571048b5e81SShri Abhyankar 55724e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 557309573ac7SBarry Smith extern PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscBool ); 55746bce7ff8SHong Zhang 55752b0b2ea7SShri Abhyankar #undef __FUNCT__ 557629a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5577766f9fbaSBarry Smith /* 5578766f9fbaSBarry Smith This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5579766f9fbaSBarry Smith */ 558029a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 55812b0b2ea7SShri Abhyankar { 55822b0b2ea7SShri Abhyankar Mat C=B; 55832b0b2ea7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 55842b0b2ea7SShri Abhyankar 
PetscErrorCode ierr; 5585766f9fbaSBarry Smith PetscInt i,j,k,ipvt[15]; 5586766f9fbaSBarry Smith const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5587766f9fbaSBarry Smith PetscInt nz,nzL,row; 5588766f9fbaSBarry Smith MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5589766f9fbaSBarry Smith const MatScalar *v,*aa=a->a; 55902b0b2ea7SShri Abhyankar PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 55910fa040f9SShri Abhyankar PetscInt sol_ver; 55922b0b2ea7SShri Abhyankar 55932b0b2ea7SShri Abhyankar PetscFunctionBegin; 55942b0b2ea7SShri Abhyankar 55950fa040f9SShri Abhyankar ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr); 55960fa040f9SShri Abhyankar 55972b0b2ea7SShri Abhyankar /* generate work space needed by the factorization */ 55982b0b2ea7SShri Abhyankar ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 55992b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 56002b0b2ea7SShri Abhyankar 56012b0b2ea7SShri Abhyankar for (i=0; i<n; i++){ 56022b0b2ea7SShri Abhyankar /* zero rtmp */ 56032b0b2ea7SShri Abhyankar /* L part */ 56042b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56052b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 56062b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 56072b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56082b0b2ea7SShri Abhyankar } 56092b0b2ea7SShri Abhyankar 56102b0b2ea7SShri Abhyankar /* U part */ 56112b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 56122b0b2ea7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 56132b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 56142b0b2ea7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56152b0b2ea7SShri Abhyankar } 56162b0b2ea7SShri Abhyankar 56172b0b2ea7SShri Abhyankar /* load in initial (unfactored row) */ 561829a97285SShri Abhyankar nz = ai[i+1] - ai[i]; 561929a97285SShri Abhyankar ajtmp = aj + 
ai[i]; 562029a97285SShri Abhyankar v = aa + bs2*ai[i]; 56212b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 562229a97285SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 56232b0b2ea7SShri Abhyankar } 56242b0b2ea7SShri Abhyankar 56252b0b2ea7SShri Abhyankar /* elimination */ 56262b0b2ea7SShri Abhyankar bjtmp = bj + bi[i]; 56272b0b2ea7SShri Abhyankar nzL = bi[i+1] - bi[i]; 56282b0b2ea7SShri Abhyankar for(k=0;k < nzL;k++) { 56292b0b2ea7SShri Abhyankar row = bjtmp[k]; 56302b0b2ea7SShri Abhyankar pc = rtmp + bs2*row; 56312b0b2ea7SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 56322b0b2ea7SShri Abhyankar if (flg) { 56332b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 5634766f9fbaSBarry Smith Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5635766f9fbaSBarry Smith /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 56362b0b2ea7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 56372b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 56382b0b2ea7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 56392b0b2ea7SShri Abhyankar for (j=0; j<nz; j++) { 5640766f9fbaSBarry Smith vv = rtmp + bs2*pj[j]; 5641766f9fbaSBarry Smith Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5642766f9fbaSBarry Smith /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 56432b0b2ea7SShri Abhyankar pv += bs2; 56442b0b2ea7SShri Abhyankar } 5645766f9fbaSBarry Smith ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 56462b0b2ea7SShri Abhyankar } 56472b0b2ea7SShri Abhyankar } 56482b0b2ea7SShri Abhyankar 56492b0b2ea7SShri Abhyankar /* finished row so stick it into b->a */ 56502b0b2ea7SShri Abhyankar /* L part */ 56512b0b2ea7SShri Abhyankar pv = b->a + bs2*bi[i] ; 56522b0b2ea7SShri Abhyankar pj = b->j + bi[i] ; 56532b0b2ea7SShri Abhyankar nz = bi[i+1] - bi[i]; 56542b0b2ea7SShri 
Abhyankar for (j=0; j<nz; j++) { 56552b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56562b0b2ea7SShri Abhyankar } 56572b0b2ea7SShri Abhyankar 56582b0b2ea7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 56592b0b2ea7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 56602b0b2ea7SShri Abhyankar pj = b->j + bdiag[i]; 56612b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5662766f9fbaSBarry Smith /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5663182b8fbaSHong Zhang ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr); 56642b0b2ea7SShri Abhyankar 56652b0b2ea7SShri Abhyankar /* U part */ 56662b0b2ea7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 56672b0b2ea7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 56682b0b2ea7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 56692b0b2ea7SShri Abhyankar for (j=0; j<nz; j++){ 56702b0b2ea7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 56712b0b2ea7SShri Abhyankar } 56722b0b2ea7SShri Abhyankar } 56732b0b2ea7SShri Abhyankar 56742b0b2ea7SShri Abhyankar ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5675832cc040SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1; 5676766f9fbaSBarry Smith C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 56772b0b2ea7SShri Abhyankar C->assembled = PETSC_TRUE; 5678766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 56792b0b2ea7SShri Abhyankar PetscFunctionReturn(0); 56802b0b2ea7SShri Abhyankar } 56812b0b2ea7SShri Abhyankar 56826bce7ff8SHong Zhang #undef __FUNCT__ 56834dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 56844dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 56856bce7ff8SHong Zhang { 56866bce7ff8SHong Zhang Mat 
C=B; 56876bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 56886bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 56896bce7ff8SHong Zhang PetscErrorCode ierr; 56905a586d82SBarry Smith const PetscInt *r,*ic; 56916bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 56926bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5693b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5694914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5695914a18a2SHong Zhang MatScalar *v_work; 5696ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 56976bce7ff8SHong Zhang 56986bce7ff8SHong Zhang PetscFunctionBegin; 56996bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 57006bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5701ae3d28f0SHong Zhang 5702fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5703fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 57046bce7ff8SHong Zhang 5705914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 5706fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5707914a18a2SHong Zhang 57086bce7ff8SHong Zhang for (i=0; i<n; i++){ 57096bce7ff8SHong Zhang /* zero rtmp */ 57106bce7ff8SHong Zhang /* L part */ 57116bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 57126bce7ff8SHong Zhang bjtmp = bj + bi[i]; 5713914a18a2SHong Zhang for (j=0; j<nz; j++){ 5714914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5715914a18a2SHong Zhang } 57166bce7ff8SHong Zhang 57176bce7ff8SHong Zhang /* U part */ 57181a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 57191a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 57201a83e813SShri Abhyankar for (j=0; j<nz; j++){ 57211a83e813SShri Abhyankar ierr = 
PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57221a83e813SShri Abhyankar } 57231a83e813SShri Abhyankar 57241a83e813SShri Abhyankar /* load in initial (unfactored row) */ 57251a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 57261a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 57271a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 57281a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57291a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 57301a83e813SShri Abhyankar } 57311a83e813SShri Abhyankar 57321a83e813SShri Abhyankar /* elimination */ 57331a83e813SShri Abhyankar bjtmp = bj + bi[i]; 57341a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 57351a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 57361a83e813SShri Abhyankar row = bjtmp[k]; 57371a83e813SShri Abhyankar pc = rtmp + bs2*row; 57381a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 57391a83e813SShri Abhyankar if (flg) { 57401a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 57411a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 57421a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 57431a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 57441a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 57451a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57461a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 57471a83e813SShri Abhyankar } 57481a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 57491a83e813SShri Abhyankar } 57501a83e813SShri Abhyankar } 57511a83e813SShri Abhyankar 57521a83e813SShri Abhyankar /* finished row so stick it into b->a */ 57531a83e813SShri Abhyankar /* L part */ 57541a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 57551a83e813SShri Abhyankar pj = b->j + bi[i] 
; 57561a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 57571a83e813SShri Abhyankar for (j=0; j<nz; j++) { 57581a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57591a83e813SShri Abhyankar } 57601a83e813SShri Abhyankar 57611a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 57621a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 57631a83e813SShri Abhyankar pj = b->j + bdiag[i]; 5764e32f2f54SBarry Smith /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 57651a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57661a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 57671a83e813SShri Abhyankar 57681a83e813SShri Abhyankar /* U part */ 57691a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 57701a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 57711a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 57721a83e813SShri Abhyankar for (j=0; j<nz; j++){ 57731a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 57741a83e813SShri Abhyankar } 57751a83e813SShri Abhyankar } 57761a83e813SShri Abhyankar 57771a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 5778fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 57791a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 57801a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 57811a83e813SShri Abhyankar 5782ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5783ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5784ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 5785ae3d28f0SHong Zhang if (both_identity){ 57864dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5787ae3d28f0SHong Zhang } 
else { 57884dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N; 5789ae3d28f0SHong Zhang } 57904dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5791ae3d28f0SHong Zhang 57921a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 5793766f9fbaSBarry Smith ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 57941a83e813SShri Abhyankar PetscFunctionReturn(0); 57951a83e813SShri Abhyankar } 57961a83e813SShri Abhyankar 57976bce7ff8SHong Zhang /* 57986bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 57994dd39f65SShri Abhyankar See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 58004dd39f65SShri Abhyankar because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 58016bce7ff8SHong Zhang */ 5802c0c7eb62SShri Abhyankar 58036bce7ff8SHong Zhang #undef __FUNCT__ 58044dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 58054dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 58066bce7ff8SHong Zhang { 58076bce7ff8SHong Zhang 58086bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 58096bce7ff8SHong Zhang PetscErrorCode ierr; 581016a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 581135aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 581235aa4fcfSShri Abhyankar 581335aa4fcfSShri Abhyankar PetscFunctionBegin; 581435aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 581535aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 581635aa4fcfSShri Abhyankar 581735aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 581835aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 581935aa4fcfSShri Abhyankar ierr = 
PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 582035aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 582135aa4fcfSShri Abhyankar if (!b->diag){ 582235aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 582335aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 582435aa4fcfSShri Abhyankar } 582535aa4fcfSShri Abhyankar bdiag = b->diag; 582635aa4fcfSShri Abhyankar 582735aa4fcfSShri Abhyankar if (n > 0) { 582835aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 582935aa4fcfSShri Abhyankar } 583035aa4fcfSShri Abhyankar 583135aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 583235aa4fcfSShri Abhyankar bi = b->i; 583335aa4fcfSShri Abhyankar bj = b->j; 583435aa4fcfSShri Abhyankar 583535aa4fcfSShri Abhyankar /* L part */ 583635aa4fcfSShri Abhyankar bi[0] = 0; 583735aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 583835aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 583935aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 584035aa4fcfSShri Abhyankar aj = a->j + ai[i]; 584135aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 584235aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 584335aa4fcfSShri Abhyankar } 584435aa4fcfSShri Abhyankar } 584535aa4fcfSShri Abhyankar 584635aa4fcfSShri Abhyankar /* U part */ 584735aa4fcfSShri Abhyankar bi_temp = bi[n]; 584835aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 584935aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 585035aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 585135aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 585235aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 585335aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 585435aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 585535aa4fcfSShri Abhyankar } 585635aa4fcfSShri Abhyankar /* diag[i] */ 585735aa4fcfSShri Abhyankar *bj = i; bj++; 585835aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 585935aa4fcfSShri Abhyankar } 
586035aa4fcfSShri Abhyankar PetscFunctionReturn(0); 586135aa4fcfSShri Abhyankar } 586235aa4fcfSShri Abhyankar 586335aa4fcfSShri Abhyankar #undef __FUNCT__ 58644dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 58654dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 586616a2bf60SHong Zhang { 586716a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 586816a2bf60SHong Zhang IS isicol; 586916a2bf60SHong Zhang PetscErrorCode ierr; 587016a2bf60SHong Zhang const PetscInt *r,*ic; 58717fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 587216a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 587316a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 587416a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 5875ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity; 587616a2bf60SHong Zhang PetscReal f; 587716a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 587816a2bf60SHong Zhang PetscBT lnkbt; 587916a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 588016a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 588116a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5882ace3abfcSBarry Smith PetscBool missing; 58837fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 588416a2bf60SHong Zhang 588516a2bf60SHong Zhang PetscFunctionBegin; 5886e32f2f54SBarry Smith if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 58876ba06ab7SHong Zhang if (bs>1){ /* check shifttype */ 58886ba06ab7SHong Zhang if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE) 58896ba06ab7SHong Zhang SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix"); 58906ba06ab7SHong Zhang } 
58916ba06ab7SHong Zhang 589216a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5893e32f2f54SBarry Smith if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 589416a2bf60SHong Zhang 589516a2bf60SHong Zhang f = info->fill; 589616a2bf60SHong Zhang levels = (PetscInt)info->levels; 589716a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 589816a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 589916a2bf60SHong Zhang 590016a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 590116a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5902ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 590316a2bf60SHong Zhang 59047fa3a6a0SHong Zhang if (!levels && both_identity) { 590516a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 59064dd39f65SShri Abhyankar ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 59074dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 590835aa4fcfSShri Abhyankar 5909d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 591035aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 591135aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 591235aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 591335aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 591435aa4fcfSShri Abhyankar b->row = isrow; 591535aa4fcfSShri Abhyankar b->col = iscol; 591635aa4fcfSShri Abhyankar b->icol = isicol; 591735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 591835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 591935aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 592035aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 592135aa4fcfSShri Abhyankar PetscFunctionReturn(0); 592235aa4fcfSShri Abhyankar } 592335aa4fcfSShri Abhyankar 592435aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 592535aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 592635aa4fcfSShri Abhyankar 592735aa4fcfSShri Abhyankar /* get new row pointers */ 592835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 592935aa4fcfSShri Abhyankar bi[0] = 0; 593035aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 593135aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 593235aa4fcfSShri Abhyankar bdiag[0] = 0; 593335aa4fcfSShri Abhyankar 5934fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 593535aa4fcfSShri Abhyankar 593635aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 593735aa4fcfSShri Abhyankar nlnk = n + 1; 593835aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 593935aa4fcfSShri Abhyankar 594035aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 594135aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 594235aa4fcfSShri Abhyankar current_space = free_space; 594335aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 594435aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 594535aa4fcfSShri Abhyankar 594635aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 594735aa4fcfSShri Abhyankar nzi = 0; 594835aa4fcfSShri Abhyankar /* copy current row into linked list */ 594935aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 5950e32f2f54SBarry Smith if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row 
in original ordering %D in permuted ordering %D",r[i],i); 595135aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 595235aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 595335aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 595435aa4fcfSShri Abhyankar nzi += nlnk; 595535aa4fcfSShri Abhyankar 595635aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 595735aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 595835aa4fcfSShri Abhyankar fm = n; 595935aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 596035aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 596135aa4fcfSShri Abhyankar lnk[fm] = i; 596235aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 596335aa4fcfSShri Abhyankar nzi++; dcount++; 596435aa4fcfSShri Abhyankar } 596535aa4fcfSShri Abhyankar 596635aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 596735aa4fcfSShri Abhyankar nzbd = 0; 596835aa4fcfSShri Abhyankar prow = lnk[n]; 596935aa4fcfSShri Abhyankar while (prow < i) { 597035aa4fcfSShri Abhyankar nnz = bdiag[prow]; 597135aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 597235aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 597335aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 597435aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 597535aa4fcfSShri Abhyankar nzi += nlnk; 597635aa4fcfSShri Abhyankar prow = lnk[prow]; 597735aa4fcfSShri Abhyankar nzbd++; 597835aa4fcfSShri Abhyankar } 597935aa4fcfSShri Abhyankar bdiag[i] = nzbd; 598035aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 598135aa4fcfSShri Abhyankar 598235aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 598335aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 598435aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 598535aa4fcfSShri 
Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 598635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 598735aa4fcfSShri Abhyankar reallocs++; 598835aa4fcfSShri Abhyankar } 598935aa4fcfSShri Abhyankar 599035aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 599135aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 599235aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 599335aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 599435aa4fcfSShri Abhyankar 599535aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 599665e19b50SBarry Smith if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 599735aa4fcfSShri Abhyankar 599835aa4fcfSShri Abhyankar current_space->array += nzi; 599935aa4fcfSShri Abhyankar current_space->local_used += nzi; 600035aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 600135aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 600235aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 600335aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 600435aa4fcfSShri Abhyankar } 600535aa4fcfSShri Abhyankar 600635aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 600735aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 600835aa4fcfSShri Abhyankar 600935aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 60109263d837SHong Zhang ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 60112ce24eb6SHong Zhang ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 601235aa4fcfSShri Abhyankar 601335aa4fcfSShri Abhyankar ierr = 
PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 601435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 6015fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 601635aa4fcfSShri Abhyankar 601735aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 601835aa4fcfSShri Abhyankar { 6019aef85c9fSShri Abhyankar PetscReal af = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 602035aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 602135aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 602235aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 602335aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 602435aa4fcfSShri Abhyankar if (diagonal_fill) { 602535aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 602635aa4fcfSShri Abhyankar } 602735aa4fcfSShri Abhyankar } 602835aa4fcfSShri Abhyankar #endif 602935aa4fcfSShri Abhyankar 603035aa4fcfSShri Abhyankar /* put together the new matrix */ 603135aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 603235aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 603335aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 603435aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 603535aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 603635aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 603735aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 603835aa4fcfSShri Abhyankar b->j = bj; 603935aa4fcfSShri Abhyankar b->i = bi; 604035aa4fcfSShri Abhyankar b->diag = bdiag; 604135aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 604235aa4fcfSShri Abhyankar b->ilen = 0; 604335aa4fcfSShri Abhyankar b->imax = 0; 
604435aa4fcfSShri Abhyankar b->row = isrow; 604535aa4fcfSShri Abhyankar b->col = iscol; 604635aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 604735aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 604835aa4fcfSShri Abhyankar b->icol = isicol; 604935aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 605035aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 605135aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 605235aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 605335aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 6054ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 6055ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6056ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 60574dd39f65SShri Abhyankar ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 605835aa4fcfSShri Abhyankar PetscFunctionReturn(0); 605935aa4fcfSShri Abhyankar } 606035aa4fcfSShri Abhyankar 60614e2b4712SSatish Balay /* 60624e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 60634e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 60644e2b4712SSatish Balay Not a good example of code reuse. 
60654e2b4712SSatish Balay */ 60664a2ae208SSatish Balay #undef __FUNCT__ 606706e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 606806e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 60694e2b4712SSatish Balay { 60704e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 60714e2b4712SSatish Balay IS isicol; 60726849ba73SBarry Smith PetscErrorCode ierr; 60735d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 60745d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 6075a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 6076d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 6077ace3abfcSBarry Smith PetscBool col_identity,row_identity,both_identity,flg; 6078329f5518SBarry Smith PetscReal f; 60794e2b4712SSatish Balay 60804e2b4712SSatish Balay PetscFunctionBegin; 60816bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 6082e32f2f54SBarry Smith if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 60836bce7ff8SHong Zhang 6084435faa5fSBarry Smith f = info->fill; 6085690b6cddSBarry Smith levels = (PetscInt)info->levels; 6086690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 60874c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 608816a2bf60SHong Zhang 6089667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 6090667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 6091ace3abfcSBarry Smith both_identity = (PetscBool) (row_identity && col_identity); 6092309c388cSBarry Smith 609341df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 609416a2bf60SHong Zhang ierr = 
MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 60958b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 60966bce7ff8SHong Zhang 6097d5f3da31SBarry Smith fact->factortype = MAT_FACTOR_ILU; 6098ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6099bb3d539aSBarry Smith b->row = isrow; 6100bb3d539aSBarry Smith b->col = iscol; 6101bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6102bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6103bb3d539aSBarry Smith b->icol = isicol; 6104bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6105b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 61066bce7ff8SHong Zhang PetscFunctionReturn(0); 61076bce7ff8SHong Zhang } 61086bce7ff8SHong Zhang 61096bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 61104e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 61114e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 61124e2b4712SSatish Balay 61134e2b4712SSatish Balay /* get new row pointers */ 6114690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 61154e2b4712SSatish Balay ainew[0] = 0; 61164e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 6117690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 6118690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 61194e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 6120690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 61214e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 6122690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 61234e2b4712SSatish Balay /* im is level for each filled value 
*/ 6124690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 61254e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 6126690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 61274e2b4712SSatish Balay dloc[0] = 0; 61284e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 6129435faa5fSBarry Smith 6130435faa5fSBarry Smith /* copy prow into linked list */ 61314e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 6132e32f2f54SBarry Smith if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 61334e2b4712SSatish Balay xi = aj + ai[r[prow]]; 61344e2b4712SSatish Balay fill[n] = n; 6135435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 61364e2b4712SSatish Balay while (nz--) { 61374e2b4712SSatish Balay fm = n; 61384e2b4712SSatish Balay idx = ic[*xi++]; 61394e2b4712SSatish Balay do { 61404e2b4712SSatish Balay m = fm; 61414e2b4712SSatish Balay fm = fill[m]; 61424e2b4712SSatish Balay } while (fm < idx); 61434e2b4712SSatish Balay fill[m] = idx; 61444e2b4712SSatish Balay fill[idx] = fm; 61454e2b4712SSatish Balay im[idx] = 0; 61464e2b4712SSatish Balay } 6147435faa5fSBarry Smith 6148435faa5fSBarry Smith /* make sure diagonal entry is included */ 6149435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 6150435faa5fSBarry Smith fm = n; 6151435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 6152435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 6153435faa5fSBarry Smith fill[fm] = prow; 6154435faa5fSBarry Smith im[prow] = 0; 6155435faa5fSBarry Smith nzf++; 6156335d9088SBarry Smith dcount++; 6157435faa5fSBarry Smith } 6158435faa5fSBarry Smith 61594e2b4712SSatish Balay nzi = 0; 61604e2b4712SSatish Balay row = fill[n]; 61614e2b4712SSatish Balay while (row < prow) { 61624e2b4712SSatish Balay incrlev = im[row] + 1; 61634e2b4712SSatish Balay nz = 
dloc[row]; 6164435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 61654e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 61664e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 61674e2b4712SSatish Balay fm = row; 61684e2b4712SSatish Balay while (nnz-- > 0) { 61694e2b4712SSatish Balay idx = *xi++; 61704e2b4712SSatish Balay if (*flev + incrlev > levels) { 61714e2b4712SSatish Balay flev++; 61724e2b4712SSatish Balay continue; 61734e2b4712SSatish Balay } 61744e2b4712SSatish Balay do { 61754e2b4712SSatish Balay m = fm; 61764e2b4712SSatish Balay fm = fill[m]; 61774e2b4712SSatish Balay } while (fm < idx); 61784e2b4712SSatish Balay if (fm != idx) { 61794e2b4712SSatish Balay im[idx] = *flev + incrlev; 61804e2b4712SSatish Balay fill[m] = idx; 61814e2b4712SSatish Balay fill[idx] = fm; 61824e2b4712SSatish Balay fm = idx; 61834e2b4712SSatish Balay nzf++; 6184ecf371e4SBarry Smith } else { 61854e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 61864e2b4712SSatish Balay } 61874e2b4712SSatish Balay flev++; 61884e2b4712SSatish Balay } 61894e2b4712SSatish Balay row = fill[row]; 61904e2b4712SSatish Balay nzi++; 61914e2b4712SSatish Balay } 61924e2b4712SSatish Balay /* copy new filled row into permanent storage */ 61934e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 61944e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 6195ecf371e4SBarry Smith 6196ecf371e4SBarry Smith /* estimate how much additional space we will need */ 6197ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 6198ecf371e4SBarry Smith /* just double the memory each time */ 6199690b6cddSBarry Smith PetscInt maxadd = jmax; 6200ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 62014e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 62024e2b4712SSatish Balay jmax += maxadd; 6203ecf371e4SBarry Smith 6204ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 62055d0c19d7SBarry Smith 
ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 62065d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6207606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 62085d0c19d7SBarry Smith ajnew = xitmp; 62095d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 62105d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6211606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62125d0c19d7SBarry Smith ajfill = xitmp; 6213eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 62144e2b4712SSatish Balay } 62155d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 62164e2b4712SSatish Balay flev = ajfill + ainew[prow]; 62174e2b4712SSatish Balay dloc[prow] = nzi; 62184e2b4712SSatish Balay fm = fill[n]; 62194e2b4712SSatish Balay while (nzf--) { 62205d0c19d7SBarry Smith *xitmp++ = fm; 62214e2b4712SSatish Balay *flev++ = im[fm]; 62224e2b4712SSatish Balay fm = fill[fm]; 62234e2b4712SSatish Balay } 6224435faa5fSBarry Smith /* make sure row has diagonal entry */ 6225435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6226e32f2f54SBarry Smith SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 62272401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6228435faa5fSBarry Smith } 62294e2b4712SSatish Balay } 6230606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 62314e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 62324e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6233606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 6234606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 62354e2b4712SSatish Balay 62366cf91177SBarry Smith #if defined(PETSC_USE_INFO) 62374e2b4712SSatish Balay { 6238329f5518SBarry Smith PetscReal af = 
((PetscReal)ainew[n])/((PetscReal)ai[n]); 6239ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6240ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6241ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6242ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6243335d9088SBarry Smith if (diagonal_fill) { 6244ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6245335d9088SBarry Smith } 62464e2b4712SSatish Balay } 624763ba0a88SBarry Smith #endif 62484e2b4712SSatish Balay 62494e2b4712SSatish Balay /* put together the new matrix */ 6250719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6251719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6252ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 6253e6b907acSBarry Smith b->free_a = PETSC_TRUE; 6254e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 62557c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 6256a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 62574e2b4712SSatish Balay b->j = ajnew; 62584e2b4712SSatish Balay b->i = ainew; 62594e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 62604e2b4712SSatish Balay b->diag = dloc; 62617f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 62624e2b4712SSatish Balay b->ilen = 0; 62634e2b4712SSatish Balay b->imax = 0; 62644e2b4712SSatish Balay b->row = isrow; 62654e2b4712SSatish Balay b->col = iscol; 6266bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? 
PETSC_TRUE : PETSC_FALSE; 6267c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6268c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6269e51c0b9cSSatish Balay b->icol = isicol; 627087828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 62714e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 62724e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 6273719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 62744e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 62754e2b4712SSatish Balay 6276ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 6277ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 6278ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 62796bce7ff8SHong Zhang 62808b1456e3SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 62818661488fSKris Buschelman PetscFunctionReturn(0); 62828661488fSKris Buschelman } 62838661488fSKris Buschelman 6284732ee342SKris Buschelman #undef __FUNCT__ 62857e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6286dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 62877e7071cdSKris Buschelman { 628812272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 628912272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 62905a9542e3SKris Buschelman PetscFunctionBegin; 62917cf1b8d3SKris Buschelman /* Undo Column scaling */ 62927cf1b8d3SKris Buschelman /* while (nz--) { */ 62937cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 62947cf1b8d3SKris Buschelman /* } */ 6295c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. 
*/ 6296c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 62977cf1b8d3SKris Buschelman PetscFunctionReturn(0); 62987cf1b8d3SKris Buschelman } 62997cf1b8d3SKris Buschelman 63007cf1b8d3SKris Buschelman #undef __FUNCT__ 63017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 63037cf1b8d3SKris Buschelman { 63047cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6305b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 63062aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 63075a9542e3SKris Buschelman PetscFunctionBegin; 63080b9da03eSKris Buschelman /* Is this really necessary? */ 630920235379SKris Buschelman while (nz--) { 63100b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 63117e7071cdSKris Buschelman } 6312c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 63137e7071cdSKris Buschelman PetscFunctionReturn(0); 63147e7071cdSKris Buschelman } 63157e7071cdSKris Buschelman 6316732ee342SKris Buschelman 6317