1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 3a4005a5dSBarry Smith 44e2b4712SSatish Balay /* 54e2b4712SSatish Balay Factorization code for BAIJ format. 64e2b4712SSatish Balay */ 74e2b4712SSatish Balay 87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 9c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 1016a2bf60SHong Zhang #include "petscbt.h" 1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h" 124e2b4712SSatish Balay 134a2ae208SSatish Balay #undef __FUNCT__ 144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16f1af5d2fSBarry Smith { 17f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18dfbe8321SBarry Smith PetscErrorCode ierr; 19690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20690b6cddSBarry Smith PetscInt *diag = a->diag; 21f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 2287828ca2SBarry Smith PetscScalar s1,*x,*b; 23f1af5d2fSBarry Smith 24f1af5d2fSBarry Smith PetscFunctionBegin; 25ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 261ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 271ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28f1af5d2fSBarry Smith 29f1af5d2fSBarry Smith /* forward solve the U^T */ 30f1af5d2fSBarry Smith for (i=0; i<n; i++) { 31f1af5d2fSBarry Smith 32f1af5d2fSBarry Smith v = aa + diag[i]; 33f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 34ef66eb69SBarry Smith s1 = (*v++)*x[i]; 35f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 36f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 37f1af5d2fSBarry Smith while (nz--) { 38f1af5d2fSBarry Smith x[*vi++] -= (*v++)*s1; 39f1af5d2fSBarry Smith } 40f1af5d2fSBarry Smith x[i] = s1; 41f1af5d2fSBarry Smith } 42f1af5d2fSBarry Smith /* backward solve the L^T */ 43f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 44f1af5d2fSBarry Smith v = aa + diag[i] - 1; 45f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 46f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 47f1af5d2fSBarry Smith s1 = x[i]; 48f1af5d2fSBarry Smith while (nz--) { 49f1af5d2fSBarry Smith x[*vi--] -= (*v--)*s1; 50f1af5d2fSBarry Smith } 51f1af5d2fSBarry Smith } 521ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 531ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55f1af5d2fSBarry Smith PetscFunctionReturn(0); 56f1af5d2fSBarry Smith } 57f1af5d2fSBarry Smith 584a2ae208SSatish Balay #undef __FUNCT__ 594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61f1af5d2fSBarry Smith { 62f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63dfbe8321SBarry Smith PetscErrorCode ierr; 64690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 66f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 6787828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 6887828ca2SBarry Smith PetscScalar *x,*b; 69f1af5d2fSBarry Smith 70f1af5d2fSBarry Smith PetscFunctionBegin; 71ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 721ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 731ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74f1af5d2fSBarry Smith 75f1af5d2fSBarry Smith /* forward solve the U^T */ 76f1af5d2fSBarry Smith idx = 0; 77f1af5d2fSBarry Smith for (i=0; i<n; i++) { 78f1af5d2fSBarry Smith 79f1af5d2fSBarry Smith v = aa + 4*diag[i]; 80f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 81ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 82f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 83f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 84f1af5d2fSBarry Smith v += 4; 85f1af5d2fSBarry Smith 86f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 87f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 88f1af5d2fSBarry Smith while (nz--) { 89f1af5d2fSBarry Smith oidx = 2*(*vi++); 90f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2; 91f1af5d2fSBarry Smith x[oidx+1] -= v[2]*s1 + v[3]*s2; 92f1af5d2fSBarry Smith v += 4; 93f1af5d2fSBarry Smith } 94f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; 95f1af5d2fSBarry Smith idx += 2; 96f1af5d2fSBarry Smith } 97f1af5d2fSBarry Smith /* backward solve the L^T */ 98f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 99f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 100f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 101f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 102f1af5d2fSBarry Smith idt = 2*i; 103f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 104f1af5d2fSBarry Smith while (nz--) { 105f1af5d2fSBarry Smith idx = 2*(*vi--); 106f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2; 107f1af5d2fSBarry Smith x[idx+1] -= v[2]*s1 + v[3]*s2; 108f1af5d2fSBarry Smith v -= 4; 109f1af5d2fSBarry Smith } 110f1af5d2fSBarry Smith } 1111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114f1af5d2fSBarry Smith PetscFunctionReturn(0); 115f1af5d2fSBarry Smith } 116f1af5d2fSBarry Smith 1174a2ae208SSatish Balay #undef __FUNCT__ 1186929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 1196929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1206929473cSShri Abhyankar { 1216929473cSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1226929473cSShri Abhyankar PetscErrorCode ierr; 1236929473cSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1246929473cSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 1256929473cSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 1266929473cSShri Abhyankar MatScalar *aa=a->a,*v; 1276929473cSShri Abhyankar PetscScalar s1,s2,x1,x2; 1286929473cSShri Abhyankar PetscScalar *x,*b; 1296929473cSShri Abhyankar 1306929473cSShri Abhyankar PetscFunctionBegin; 1316929473cSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1326929473cSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1336929473cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1346929473cSShri Abhyankar 1356929473cSShri Abhyankar /* forward solve the U^T */ 1366929473cSShri Abhyankar idx = 0; 1376929473cSShri Abhyankar for (i=0; i<n; i++) { 1386929473cSShri Abhyankar v = aa + bs2*diag[i]; 1396929473cSShri Abhyankar /* multiply by the inverse of the block diagonal */ 1406929473cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 1416929473cSShri Abhyankar s1 = v[0]*x1 + v[1]*x2; 1426929473cSShri Abhyankar s2 = v[2]*x1 + v[3]*x2; 1436929473cSShri Abhyankar v -= bs2; 1446929473cSShri Abhyankar 1456929473cSShri Abhyankar vi = aj + diag[i] - 1; 1466929473cSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1476929473cSShri Abhyankar for(j=0;j>-nz;j--){ 1486929473cSShri Abhyankar oidx = bs*vi[j]; 1496929473cSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2; 1506929473cSShri Abhyankar x[oidx+1] -= v[2]*s1 + v[3]*s2; 1516929473cSShri Abhyankar v -= bs2; 1526929473cSShri Abhyankar } 1536929473cSShri Abhyankar x[idx] = s1;x[1+idx] = s2; 1546929473cSShri Abhyankar idx += bs; 1556929473cSShri Abhyankar } 1566929473cSShri Abhyankar /* backward solve the L^T */ 1576929473cSShri Abhyankar for (i=n-1; i>=0; i--){ 1586929473cSShri Abhyankar v = aa + bs2*ai[i]; 1596929473cSShri Abhyankar vi = aj + ai[i]; 1606929473cSShri Abhyankar nz = ai[i+1] - ai[i]; 1616929473cSShri Abhyankar idt = bs*i; 1626929473cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 1636929473cSShri Abhyankar for(j=0;j<nz;j++){ 1646929473cSShri Abhyankar idx = bs*vi[j]; 1656929473cSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2; 1666929473cSShri Abhyankar x[idx+1] -= v[2]*s1 + v[3]*s2; 1676929473cSShri Abhyankar v += bs2; 1686929473cSShri Abhyankar } 1696929473cSShri Abhyankar } 1706929473cSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1716929473cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1726929473cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1736929473cSShri Abhyankar PetscFunctionReturn(0); 1746929473cSShri Abhyankar } 1756929473cSShri Abhyankar 1766929473cSShri Abhyankar #undef __FUNCT__ 1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 179f1af5d2fSBarry Smith { 180f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181dfbe8321SBarry Smith PetscErrorCode ierr; 182690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 184f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 18587828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 18687828ca2SBarry Smith PetscScalar *x,*b; 187f1af5d2fSBarry Smith 188f1af5d2fSBarry Smith PetscFunctionBegin; 189ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 1901ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1911ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192f1af5d2fSBarry Smith 193f1af5d2fSBarry Smith /* forward solve the U^T */ 194f1af5d2fSBarry Smith idx = 0; 195f1af5d2fSBarry Smith for (i=0; i<n; i++) { 196f1af5d2fSBarry Smith 197f1af5d2fSBarry Smith v = aa + 9*diag[i]; 198f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 199ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203f1af5d2fSBarry Smith v += 9; 204f1af5d2fSBarry Smith 205f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 206f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 207f1af5d2fSBarry Smith while (nz--) { 208f1af5d2fSBarry Smith oidx = 3*(*vi++); 209f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210f1af5d2fSBarry Smith x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211f1af5d2fSBarry Smith x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212f1af5d2fSBarry Smith v += 9; 213f1af5d2fSBarry Smith } 214f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215f1af5d2fSBarry Smith idx += 3; 216f1af5d2fSBarry Smith } 217f1af5d2fSBarry Smith /* backward solve the L^T */ 218f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 219f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 220f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 221f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 222f1af5d2fSBarry Smith idt = 3*i; 223f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224f1af5d2fSBarry Smith while (nz--) { 225f1af5d2fSBarry Smith idx = 3*(*vi--); 226f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227f1af5d2fSBarry Smith x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228f1af5d2fSBarry Smith x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229f1af5d2fSBarry Smith v -= 9; 230f1af5d2fSBarry Smith } 231f1af5d2fSBarry Smith } 2321ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2331ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235f1af5d2fSBarry Smith PetscFunctionReturn(0); 236f1af5d2fSBarry Smith } 237f1af5d2fSBarry Smith 2384a2ae208SSatish Balay #undef __FUNCT__ 239*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct" 240*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 241*8499736aSShri Abhyankar { 242*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243*8499736aSShri Abhyankar PetscErrorCode ierr; 244*8499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 245*8499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 246*8499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 247*8499736aSShri Abhyankar MatScalar *aa=a->a,*v; 248*8499736aSShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 249*8499736aSShri Abhyankar PetscScalar *x,*b; 250*8499736aSShri Abhyankar 251*8499736aSShri Abhyankar PetscFunctionBegin; 252*8499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253*8499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 254*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255*8499736aSShri Abhyankar 256*8499736aSShri Abhyankar /* forward solve the U^T */ 257*8499736aSShri Abhyankar idx = 0; 258*8499736aSShri Abhyankar for (i=0; i<n; i++) { 259*8499736aSShri Abhyankar v = aa + bs2*diag[i]; 260*8499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 261*8499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 262*8499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 263*8499736aSShri Abhyankar s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 264*8499736aSShri Abhyankar s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 265*8499736aSShri Abhyankar v -= bs2; 266*8499736aSShri Abhyankar 267*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 268*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 269*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 270*8499736aSShri Abhyankar oidx = bs*vi[j]; 271*8499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 272*8499736aSShri Abhyankar x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 273*8499736aSShri Abhyankar x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 274*8499736aSShri Abhyankar v -= bs2; 275*8499736aSShri Abhyankar } 276*8499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 277*8499736aSShri Abhyankar idx += bs; 278*8499736aSShri Abhyankar } 279*8499736aSShri Abhyankar /* backward solve the L^T */ 280*8499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 281*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 282*8499736aSShri Abhyankar vi = aj + ai[i]; 283*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 284*8499736aSShri Abhyankar idt = bs*i; 285*8499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 286*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 287*8499736aSShri Abhyankar idx = bs*vi[j]; 288*8499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 289*8499736aSShri Abhyankar x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 290*8499736aSShri Abhyankar x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 291*8499736aSShri Abhyankar v += bs2; 292*8499736aSShri Abhyankar } 293*8499736aSShri Abhyankar } 294*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 297*8499736aSShri Abhyankar PetscFunctionReturn(0); 298*8499736aSShri Abhyankar } 299*8499736aSShri Abhyankar 300*8499736aSShri Abhyankar #undef __FUNCT__ 3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 302dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 303f1af5d2fSBarry Smith { 304f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305dfbe8321SBarry Smith PetscErrorCode ierr; 306690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 308f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 30987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 31087828ca2SBarry Smith PetscScalar *x,*b; 311f1af5d2fSBarry Smith 312f1af5d2fSBarry Smith PetscFunctionBegin; 313ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 3141ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316f1af5d2fSBarry Smith 317f1af5d2fSBarry Smith /* forward solve the U^T */ 318f1af5d2fSBarry Smith idx = 0; 319f1af5d2fSBarry Smith for (i=0; i<n; i++) { 320f1af5d2fSBarry Smith 321f1af5d2fSBarry Smith v = aa + 16*diag[i]; 322f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 323ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328f1af5d2fSBarry Smith v += 16; 329f1af5d2fSBarry Smith 330f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 331f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 332f1af5d2fSBarry Smith while (nz--) { 333f1af5d2fSBarry Smith oidx = 4*(*vi++); 334f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335f1af5d2fSBarry Smith x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336f1af5d2fSBarry Smith x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337f1af5d2fSBarry Smith x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338f1af5d2fSBarry Smith v += 16; 339f1af5d2fSBarry Smith } 340f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341f1af5d2fSBarry Smith idx += 4; 342f1af5d2fSBarry Smith } 343f1af5d2fSBarry Smith /* backward solve the L^T */ 344f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 345f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 346f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 347f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 348f1af5d2fSBarry Smith idt = 4*i; 349f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350f1af5d2fSBarry Smith while (nz--) { 351f1af5d2fSBarry Smith idx = 4*(*vi--); 352f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353f1af5d2fSBarry Smith x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354f1af5d2fSBarry Smith x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355f1af5d2fSBarry Smith x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356f1af5d2fSBarry Smith v -= 16; 357f1af5d2fSBarry Smith } 358f1af5d2fSBarry Smith } 3591ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3601ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362f1af5d2fSBarry Smith PetscFunctionReturn(0); 363f1af5d2fSBarry Smith } 364f1af5d2fSBarry Smith 3654a2ae208SSatish Balay #undef __FUNCT__ 366*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct" 367*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 368*8499736aSShri Abhyankar { 369*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 370*8499736aSShri Abhyankar PetscErrorCode ierr; 371*8499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 372*8499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 373*8499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 374*8499736aSShri Abhyankar MatScalar *aa=a->a,*v; 375*8499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 376*8499736aSShri Abhyankar PetscScalar *x,*b; 377*8499736aSShri Abhyankar 378*8499736aSShri Abhyankar PetscFunctionBegin; 379*8499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380*8499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 381*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382*8499736aSShri Abhyankar 383*8499736aSShri Abhyankar /* forward solve the U^T */ 384*8499736aSShri Abhyankar idx = 0; 385*8499736aSShri Abhyankar for (i=0; i<n; i++) { 386*8499736aSShri Abhyankar v = aa + bs2*diag[i]; 387*8499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 388*8499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 389*8499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390*8499736aSShri Abhyankar s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391*8499736aSShri Abhyankar s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392*8499736aSShri Abhyankar s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393*8499736aSShri Abhyankar v -= bs2; 394*8499736aSShri Abhyankar 395*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 396*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 397*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 398*8499736aSShri Abhyankar oidx = bs*vi[j]; 399*8499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 400*8499736aSShri Abhyankar x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 401*8499736aSShri Abhyankar x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 402*8499736aSShri Abhyankar x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 403*8499736aSShri Abhyankar v -= bs2; 404*8499736aSShri Abhyankar } 405*8499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 406*8499736aSShri Abhyankar idx += bs; 407*8499736aSShri Abhyankar } 408*8499736aSShri Abhyankar /* backward solve the L^T */ 409*8499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 410*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 411*8499736aSShri Abhyankar vi = aj + ai[i]; 412*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 413*8499736aSShri Abhyankar idt = bs*i; 414*8499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 415*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 416*8499736aSShri Abhyankar idx = bs*vi[j]; 417*8499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 418*8499736aSShri Abhyankar x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 419*8499736aSShri Abhyankar x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 420*8499736aSShri Abhyankar x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 421*8499736aSShri Abhyankar v += bs2; 422*8499736aSShri Abhyankar } 423*8499736aSShri Abhyankar } 424*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 425*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 426*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 427*8499736aSShri Abhyankar PetscFunctionReturn(0); 428*8499736aSShri Abhyankar } 429*8499736aSShri Abhyankar 430*8499736aSShri Abhyankar #undef __FUNCT__ 4314a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 432dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 433f1af5d2fSBarry Smith { 434f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435dfbe8321SBarry Smith PetscErrorCode ierr; 436690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 438f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 43987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 44087828ca2SBarry Smith PetscScalar *x,*b; 441f1af5d2fSBarry Smith 442f1af5d2fSBarry Smith PetscFunctionBegin; 443ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 4441ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4451ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446f1af5d2fSBarry Smith 447f1af5d2fSBarry Smith /* forward solve the U^T */ 448f1af5d2fSBarry Smith idx = 0; 449f1af5d2fSBarry Smith for (i=0; i<n; i++) { 450f1af5d2fSBarry Smith 451f1af5d2fSBarry Smith v = aa + 25*diag[i]; 452f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 453ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459f1af5d2fSBarry Smith v += 25; 460f1af5d2fSBarry Smith 461f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 462f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 463f1af5d2fSBarry Smith while (nz--) { 464f1af5d2fSBarry Smith oidx = 5*(*vi++); 465f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466f1af5d2fSBarry Smith x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467f1af5d2fSBarry Smith x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468f1af5d2fSBarry Smith x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469f1af5d2fSBarry Smith x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470f1af5d2fSBarry Smith v += 25; 471f1af5d2fSBarry Smith } 472f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473f1af5d2fSBarry Smith idx += 5; 474f1af5d2fSBarry Smith } 475f1af5d2fSBarry Smith /* backward solve the L^T */ 476f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 477f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 478f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 479f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 480f1af5d2fSBarry Smith idt = 5*i; 481f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482f1af5d2fSBarry Smith while (nz--) { 483f1af5d2fSBarry Smith idx = 5*(*vi--); 484f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485f1af5d2fSBarry Smith x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486f1af5d2fSBarry Smith x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487f1af5d2fSBarry Smith x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488f1af5d2fSBarry Smith x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489f1af5d2fSBarry Smith v -= 25; 490f1af5d2fSBarry Smith } 491f1af5d2fSBarry Smith } 4921ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495f1af5d2fSBarry Smith PetscFunctionReturn(0); 496f1af5d2fSBarry Smith } 497f1af5d2fSBarry Smith 4984a2ae208SSatish Balay #undef __FUNCT__ 499*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct" 500*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 501*8499736aSShri Abhyankar { 502*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 503*8499736aSShri Abhyankar PetscErrorCode ierr; 504*8499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 505*8499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 506*8499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 507*8499736aSShri Abhyankar MatScalar *aa=a->a,*v; 508*8499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 509*8499736aSShri Abhyankar PetscScalar *x,*b; 510*8499736aSShri Abhyankar 511*8499736aSShri Abhyankar PetscFunctionBegin; 512*8499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513*8499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 514*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515*8499736aSShri Abhyankar 516*8499736aSShri Abhyankar /* forward solve the U^T */ 517*8499736aSShri Abhyankar idx = 0; 518*8499736aSShri Abhyankar for (i=0; i<n; i++) { 519*8499736aSShri Abhyankar v = aa + bs2*diag[i]; 520*8499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 521*8499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 522*8499736aSShri Abhyankar x5 = x[4+idx]; 523*8499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 524*8499736aSShri Abhyankar s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 525*8499736aSShri Abhyankar s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 526*8499736aSShri Abhyankar s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 527*8499736aSShri Abhyankar s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 528*8499736aSShri Abhyankar v -= bs2; 529*8499736aSShri Abhyankar 530*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 531*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 532*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 533*8499736aSShri Abhyankar oidx = bs*vi[j]; 534*8499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 535*8499736aSShri Abhyankar x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 536*8499736aSShri Abhyankar x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 537*8499736aSShri Abhyankar x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 538*8499736aSShri Abhyankar x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 539*8499736aSShri Abhyankar v -= bs2; 540*8499736aSShri Abhyankar } 541*8499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 542*8499736aSShri Abhyankar idx += bs; 543*8499736aSShri Abhyankar } 544*8499736aSShri Abhyankar /* backward solve the L^T */ 545*8499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 546*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 547*8499736aSShri Abhyankar vi = aj + ai[i]; 548*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 549*8499736aSShri Abhyankar idt = bs*i; 550*8499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 551*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 552*8499736aSShri Abhyankar idx = bs*vi[j]; 553*8499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 554*8499736aSShri Abhyankar x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 555*8499736aSShri Abhyankar x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 556*8499736aSShri Abhyankar x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 557*8499736aSShri Abhyankar x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 558*8499736aSShri Abhyankar v += bs2; 559*8499736aSShri Abhyankar } 560*8499736aSShri Abhyankar } 561*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 562*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 563*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 564*8499736aSShri Abhyankar PetscFunctionReturn(0); 565*8499736aSShri Abhyankar } 566*8499736aSShri Abhyankar 567*8499736aSShri Abhyankar #undef __FUNCT__ 5684a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 569dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 570f1af5d2fSBarry Smith { 571f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572dfbe8321SBarry Smith PetscErrorCode ierr; 573690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 575f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 57687828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 57787828ca2SBarry Smith PetscScalar *x,*b; 578f1af5d2fSBarry Smith 579f1af5d2fSBarry Smith PetscFunctionBegin; 580ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 5811ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5821ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583f1af5d2fSBarry Smith 584f1af5d2fSBarry Smith /* forward solve the U^T */ 585f1af5d2fSBarry Smith idx = 0; 586f1af5d2fSBarry Smith for (i=0; i<n; i++) { 587f1af5d2fSBarry Smith 588f1af5d2fSBarry Smith v = aa + 36*diag[i]; 589f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 590ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591ef66eb69SBarry Smith x6 = x[5+idx]; 592f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598f1af5d2fSBarry Smith v += 36; 599f1af5d2fSBarry Smith 600f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 601f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 602f1af5d2fSBarry Smith while (nz--) { 603f1af5d2fSBarry Smith oidx = 6*(*vi++); 604f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605f1af5d2fSBarry Smith x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606f1af5d2fSBarry Smith x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607f1af5d2fSBarry Smith x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608f1af5d2fSBarry Smith x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609f1af5d2fSBarry Smith x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610f1af5d2fSBarry Smith v += 36; 611f1af5d2fSBarry Smith } 612f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613f1af5d2fSBarry Smith x[5+idx] = s6; 614f1af5d2fSBarry Smith idx += 6; 615f1af5d2fSBarry Smith } 616f1af5d2fSBarry Smith /* backward solve the L^T */ 617f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 618f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 619f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 620f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 621f1af5d2fSBarry Smith idt = 6*i; 622f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623f1af5d2fSBarry Smith s6 = x[5+idt]; 624f1af5d2fSBarry Smith while (nz--) { 625f1af5d2fSBarry Smith idx = 6*(*vi--); 626f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627f1af5d2fSBarry Smith x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628f1af5d2fSBarry Smith x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629f1af5d2fSBarry Smith x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630f1af5d2fSBarry Smith x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631f1af5d2fSBarry Smith x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632f1af5d2fSBarry Smith v -= 36; 633f1af5d2fSBarry Smith } 634f1af5d2fSBarry Smith } 6351ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 6361ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638f1af5d2fSBarry Smith PetscFunctionReturn(0); 639f1af5d2fSBarry Smith } 640f1af5d2fSBarry Smith 6414a2ae208SSatish Balay #undef __FUNCT__ 642*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct" 643*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 644*8499736aSShri Abhyankar { 645*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 646*8499736aSShri Abhyankar PetscErrorCode ierr; 647*8499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 648*8499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 649*8499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 650*8499736aSShri Abhyankar MatScalar *aa=a->a,*v; 651*8499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 652*8499736aSShri Abhyankar PetscScalar *x,*b; 653*8499736aSShri Abhyankar 654*8499736aSShri Abhyankar PetscFunctionBegin; 655*8499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656*8499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 657*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658*8499736aSShri Abhyankar 659*8499736aSShri Abhyankar /* forward solve the U^T */ 660*8499736aSShri Abhyankar idx = 0; 661*8499736aSShri Abhyankar for (i=0; i<n; i++) { 662*8499736aSShri Abhyankar v = aa + bs2*diag[i]; 663*8499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 664*8499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 665*8499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; 666*8499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 667*8499736aSShri Abhyankar s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 668*8499736aSShri Abhyankar s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 669*8499736aSShri Abhyankar s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 670*8499736aSShri Abhyankar s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 671*8499736aSShri Abhyankar s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 672*8499736aSShri Abhyankar v -= bs2; 673*8499736aSShri Abhyankar 674*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 675*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 676*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 677*8499736aSShri Abhyankar oidx = bs*vi[j]; 678*8499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679*8499736aSShri Abhyankar x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680*8499736aSShri Abhyankar x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681*8499736aSShri Abhyankar x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682*8499736aSShri Abhyankar x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683*8499736aSShri Abhyankar x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684*8499736aSShri Abhyankar v -= bs2; 685*8499736aSShri Abhyankar } 686*8499736aSShri Abhyankar x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 687*8499736aSShri Abhyankar x[5+idx] = s6; 688*8499736aSShri Abhyankar idx += bs; 689*8499736aSShri Abhyankar } 690*8499736aSShri Abhyankar /* backward solve the L^T */ 691*8499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 692*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 693*8499736aSShri Abhyankar vi = aj + ai[i]; 694*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 695*8499736aSShri Abhyankar idt = bs*i; 696*8499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 697*8499736aSShri Abhyankar s6 = x[5+idt]; 698*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 699*8499736aSShri Abhyankar idx = bs*vi[j]; 700*8499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 701*8499736aSShri Abhyankar x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 702*8499736aSShri Abhyankar x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 703*8499736aSShri Abhyankar x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 704*8499736aSShri Abhyankar x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 705*8499736aSShri Abhyankar x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 706*8499736aSShri Abhyankar v += bs2; 707*8499736aSShri Abhyankar } 708*8499736aSShri Abhyankar } 709*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 710*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 711*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 712*8499736aSShri Abhyankar PetscFunctionReturn(0); 713*8499736aSShri Abhyankar } 714*8499736aSShri Abhyankar 715*8499736aSShri Abhyankar #undef __FUNCT__ 7164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 717dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 718f1af5d2fSBarry Smith { 719f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720dfbe8321SBarry Smith PetscErrorCode ierr; 721690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722690b6cddSBarry Smith PetscInt *diag = a->diag,oidx; 723f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 72487828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 72587828ca2SBarry Smith PetscScalar *x,*b; 726f1af5d2fSBarry Smith 727f1af5d2fSBarry Smith PetscFunctionBegin; 728ef66eb69SBarry Smith ierr = VecCopy(bb,xx);CHKERRQ(ierr); 7291ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 7301ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731f1af5d2fSBarry Smith 732f1af5d2fSBarry Smith /* forward solve the U^T */ 733f1af5d2fSBarry Smith idx = 0; 734f1af5d2fSBarry Smith for (i=0; i<n; i++) { 735f1af5d2fSBarry Smith 736f1af5d2fSBarry Smith v = aa + 49*diag[i]; 737f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 738ef66eb69SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739ef66eb69SBarry Smith x6 = x[5+idx]; x7 = x[6+idx]; 740f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747f1af5d2fSBarry Smith v += 49; 748f1af5d2fSBarry Smith 749f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 750f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 751f1af5d2fSBarry Smith while (nz--) { 752f1af5d2fSBarry Smith oidx = 7*(*vi++); 753f1af5d2fSBarry Smith x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754f1af5d2fSBarry Smith x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755f1af5d2fSBarry Smith x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756f1af5d2fSBarry Smith x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757f1af5d2fSBarry Smith x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758f1af5d2fSBarry Smith x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759f1af5d2fSBarry Smith x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760f1af5d2fSBarry Smith v += 49; 761f1af5d2fSBarry Smith } 762f1af5d2fSBarry Smith x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763f1af5d2fSBarry Smith x[5+idx] = s6;x[6+idx] = s7; 764f1af5d2fSBarry Smith idx += 7; 765f1af5d2fSBarry Smith } 766f1af5d2fSBarry Smith /* backward solve the L^T */ 767f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 768f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 769f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 770f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 771f1af5d2fSBarry Smith idt = 7*i; 772f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773f1af5d2fSBarry Smith s6 = x[5+idt];s7 = x[6+idt]; 774f1af5d2fSBarry Smith while (nz--) { 775f1af5d2fSBarry Smith idx = 7*(*vi--); 776f1af5d2fSBarry Smith x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777f1af5d2fSBarry Smith x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778f1af5d2fSBarry Smith x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779f1af5d2fSBarry Smith x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780f1af5d2fSBarry Smith x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781f1af5d2fSBarry Smith x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782f1af5d2fSBarry Smith x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783f1af5d2fSBarry Smith v -= 49; 784f1af5d2fSBarry Smith } 785f1af5d2fSBarry Smith } 7861ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 7871ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789f1af5d2fSBarry Smith PetscFunctionReturn(0); 790f1af5d2fSBarry Smith } 791*8499736aSShri Abhyankar #undef __FUNCT__ 792*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct" 793*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 794*8499736aSShri Abhyankar { 795*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796*8499736aSShri Abhyankar PetscErrorCode ierr; 797*8499736aSShri Abhyankar PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 798*8499736aSShri Abhyankar PetscInt nz,idx,idt,j,i,oidx; 799*8499736aSShri Abhyankar PetscInt bs=A->rmap->bs,bs2=a->bs2; 800*8499736aSShri Abhyankar MatScalar *aa=a->a,*v; 801*8499736aSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 802*8499736aSShri Abhyankar PetscScalar *x,*b; 803*8499736aSShri Abhyankar 804*8499736aSShri Abhyankar PetscFunctionBegin; 805*8499736aSShri Abhyankar ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806*8499736aSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 807*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 808*8499736aSShri Abhyankar 809*8499736aSShri Abhyankar /* forward solve the U^T */ 810*8499736aSShri Abhyankar idx = 0; 811*8499736aSShri Abhyankar for (i=0; i<n; i++) { 812*8499736aSShri Abhyankar v = aa + bs2*diag[i]; 813*8499736aSShri Abhyankar /* multiply by the inverse of the block diagonal */ 814*8499736aSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 815*8499736aSShri Abhyankar x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 816*8499736aSShri Abhyankar s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 817*8499736aSShri Abhyankar s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 818*8499736aSShri Abhyankar s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 819*8499736aSShri Abhyankar s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 820*8499736aSShri Abhyankar s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 821*8499736aSShri Abhyankar s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 822*8499736aSShri Abhyankar s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 823*8499736aSShri Abhyankar v -= bs2; 824*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 825*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 826*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 827*8499736aSShri Abhyankar oidx = bs*vi[j]; 828*8499736aSShri Abhyankar x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829*8499736aSShri Abhyankar x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830*8499736aSShri Abhyankar x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831*8499736aSShri Abhyankar x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832*8499736aSShri Abhyankar x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833*8499736aSShri Abhyankar x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834*8499736aSShri Abhyankar x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835*8499736aSShri Abhyankar v -= bs2; 836*8499736aSShri Abhyankar } 837*8499736aSShri Abhyankar x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 838*8499736aSShri Abhyankar x[5+idx] = s6; x[6+idx] = s7; 839*8499736aSShri Abhyankar idx += bs; 840*8499736aSShri Abhyankar } 841*8499736aSShri Abhyankar /* backward solve the L^T */ 842*8499736aSShri Abhyankar for (i=n-1; i>=0; i--){ 843*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 844*8499736aSShri Abhyankar vi = aj + ai[i]; 845*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 846*8499736aSShri Abhyankar idt = bs*i; 847*8499736aSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 848*8499736aSShri Abhyankar s6 = x[5+idt]; s7 = x[6+idt]; 849*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 850*8499736aSShri Abhyankar idx = bs*vi[j]; 851*8499736aSShri Abhyankar x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 852*8499736aSShri Abhyankar x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 853*8499736aSShri Abhyankar x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 854*8499736aSShri Abhyankar x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 855*8499736aSShri Abhyankar x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 856*8499736aSShri Abhyankar x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 857*8499736aSShri Abhyankar x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 858*8499736aSShri Abhyankar v += bs2; 859*8499736aSShri Abhyankar } 860*8499736aSShri Abhyankar } 861*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 862*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 863*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 864*8499736aSShri Abhyankar PetscFunctionReturn(0); 865*8499736aSShri Abhyankar } 866f1af5d2fSBarry Smith 867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/ 8684a2ae208SSatish Balay #undef __FUNCT__ 8694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 870dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 871f1af5d2fSBarry Smith { 872f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 8746849ba73SBarry Smith PetscErrorCode ierr; 8755d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 8765d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877690b6cddSBarry Smith PetscInt *diag = a->diag; 878f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 87987828ca2SBarry Smith PetscScalar s1,*x,*b,*t; 880f1af5d2fSBarry Smith 881f1af5d2fSBarry Smith PetscFunctionBegin; 8821ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 8831ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884f1af5d2fSBarry Smith t = a->solve_work; 885f1af5d2fSBarry Smith 886f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888f1af5d2fSBarry Smith 889f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 890f1af5d2fSBarry Smith for (i=0; i<n; i++) { 891f1af5d2fSBarry Smith t[i] = b[c[i]]; 892f1af5d2fSBarry Smith } 893f1af5d2fSBarry Smith 894f1af5d2fSBarry Smith /* forward solve the U^T */ 895f1af5d2fSBarry Smith for (i=0; i<n; i++) { 896f1af5d2fSBarry Smith 897f1af5d2fSBarry Smith v = aa + diag[i]; 898f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 899f1af5d2fSBarry Smith s1 = (*v++)*t[i]; 900f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 901f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 902f1af5d2fSBarry Smith while (nz--) { 903f1af5d2fSBarry Smith t[*vi++] -= (*v++)*s1; 904f1af5d2fSBarry Smith } 905f1af5d2fSBarry Smith t[i] = s1; 906f1af5d2fSBarry Smith } 907f1af5d2fSBarry Smith /* backward solve the L^T */ 908f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 909f1af5d2fSBarry Smith v = aa + diag[i] - 1; 910f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 911f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 912f1af5d2fSBarry Smith s1 = t[i]; 913f1af5d2fSBarry Smith while (nz--) { 914f1af5d2fSBarry Smith t[*vi--] -= (*v--)*s1; 915f1af5d2fSBarry Smith } 916f1af5d2fSBarry Smith } 917f1af5d2fSBarry Smith 918f1af5d2fSBarry Smith /* copy t into x according to permutation */ 919f1af5d2fSBarry Smith for (i=0; i<n; i++) { 920f1af5d2fSBarry Smith x[r[i]] = t[i]; 921f1af5d2fSBarry Smith } 922f1af5d2fSBarry Smith 923f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 9251ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 9261ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928f1af5d2fSBarry Smith PetscFunctionReturn(0); 929f1af5d2fSBarry Smith } 930f1af5d2fSBarry Smith 9314a2ae208SSatish Balay #undef __FUNCT__ 9324a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 933dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 934f1af5d2fSBarry Smith { 935f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 9376849ba73SBarry Smith PetscErrorCode ierr; 9385d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 9395d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 941f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 94287828ca2SBarry Smith PetscScalar s1,s2,x1,x2; 94387828ca2SBarry Smith PetscScalar *x,*b,*t; 944f1af5d2fSBarry Smith 945f1af5d2fSBarry Smith PetscFunctionBegin; 9461ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 9471ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948f1af5d2fSBarry Smith t = a->solve_work; 949f1af5d2fSBarry Smith 950f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952f1af5d2fSBarry Smith 953f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 954f1af5d2fSBarry Smith ii = 0; 955f1af5d2fSBarry Smith for (i=0; i<n; i++) { 956f1af5d2fSBarry Smith ic = 2*c[i]; 957f1af5d2fSBarry Smith t[ii] = b[ic]; 958f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 959f1af5d2fSBarry Smith ii += 2; 960f1af5d2fSBarry Smith } 961f1af5d2fSBarry Smith 962f1af5d2fSBarry Smith /* forward solve the U^T */ 963f1af5d2fSBarry Smith idx = 0; 964f1af5d2fSBarry Smith for (i=0; i<n; i++) { 965f1af5d2fSBarry Smith 966f1af5d2fSBarry Smith v = aa + 4*diag[i]; 967f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 968f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 969f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2; 970f1af5d2fSBarry Smith s2 = v[2]*x1 + v[3]*x2; 971f1af5d2fSBarry Smith v += 4; 972f1af5d2fSBarry Smith 973f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 974f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 975f1af5d2fSBarry Smith while (nz--) { 976f1af5d2fSBarry Smith oidx = 2*(*vi++); 977f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2; 978f1af5d2fSBarry Smith t[oidx+1] -= v[2]*s1 + v[3]*s2; 979f1af5d2fSBarry Smith v += 4; 980f1af5d2fSBarry Smith } 981f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 982f1af5d2fSBarry Smith idx += 2; 983f1af5d2fSBarry Smith } 984f1af5d2fSBarry Smith /* backward solve the L^T */ 985f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 986f1af5d2fSBarry Smith v = aa + 4*diag[i] - 4; 987f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 988f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 989f1af5d2fSBarry Smith idt = 2*i; 990f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 991f1af5d2fSBarry Smith while (nz--) { 992f1af5d2fSBarry Smith idx = 2*(*vi--); 993f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2; 994f1af5d2fSBarry Smith t[idx+1] -= v[2]*s1 + v[3]*s2; 995f1af5d2fSBarry Smith v -= 4; 996f1af5d2fSBarry Smith } 997f1af5d2fSBarry Smith } 998f1af5d2fSBarry Smith 999f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1000f1af5d2fSBarry Smith ii = 0; 1001f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1002f1af5d2fSBarry Smith ir = 2*r[i]; 1003f1af5d2fSBarry Smith x[ir] = t[ii]; 1004f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1005f1af5d2fSBarry Smith ii += 2; 1006f1af5d2fSBarry Smith } 1007f1af5d2fSBarry Smith 1008f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 10101ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 10111ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013f1af5d2fSBarry Smith PetscFunctionReturn(0); 1014f1af5d2fSBarry Smith } 1015f1af5d2fSBarry Smith 10164a2ae208SSatish Balay #undef __FUNCT__ 10174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1018dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1019f1af5d2fSBarry Smith { 1020f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1021f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 10226849ba73SBarry Smith PetscErrorCode ierr; 10235d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 10245d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1025690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1026f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 102787828ca2SBarry Smith PetscScalar s1,s2,s3,x1,x2,x3; 102887828ca2SBarry Smith PetscScalar *x,*b,*t; 1029f1af5d2fSBarry Smith 1030f1af5d2fSBarry Smith PetscFunctionBegin; 10311ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 10321ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1033f1af5d2fSBarry Smith t = a->solve_work; 1034f1af5d2fSBarry Smith 1035f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1036f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1037f1af5d2fSBarry Smith 1038f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1039f1af5d2fSBarry Smith ii = 0; 1040f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1041f1af5d2fSBarry Smith ic = 3*c[i]; 1042f1af5d2fSBarry Smith t[ii] = b[ic]; 1043f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1044f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1045f1af5d2fSBarry Smith ii += 3; 1046f1af5d2fSBarry Smith } 1047f1af5d2fSBarry Smith 1048f1af5d2fSBarry Smith /* forward solve the U^T */ 1049f1af5d2fSBarry Smith idx = 0; 1050f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1051f1af5d2fSBarry Smith 1052f1af5d2fSBarry Smith v = aa + 9*diag[i]; 1053f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1054f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1055f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1056f1af5d2fSBarry Smith s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1057f1af5d2fSBarry Smith s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1058f1af5d2fSBarry Smith v += 9; 1059f1af5d2fSBarry Smith 1060f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1061f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1062f1af5d2fSBarry Smith while (nz--) { 1063f1af5d2fSBarry Smith oidx = 3*(*vi++); 1064f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1065f1af5d2fSBarry Smith t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1066f1af5d2fSBarry Smith t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1067f1af5d2fSBarry Smith v += 9; 1068f1af5d2fSBarry Smith } 1069f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1070f1af5d2fSBarry Smith idx += 3; 1071f1af5d2fSBarry Smith } 1072f1af5d2fSBarry Smith /* backward solve the L^T */ 1073f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1074f1af5d2fSBarry Smith v = aa + 9*diag[i] - 9; 1075f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1076f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1077f1af5d2fSBarry Smith idt = 3*i; 1078f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1079f1af5d2fSBarry Smith while (nz--) { 1080f1af5d2fSBarry Smith idx = 3*(*vi--); 1081f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1082f1af5d2fSBarry Smith t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1083f1af5d2fSBarry Smith t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1084f1af5d2fSBarry Smith v -= 9; 1085f1af5d2fSBarry Smith } 1086f1af5d2fSBarry Smith } 1087f1af5d2fSBarry Smith 1088f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1089f1af5d2fSBarry Smith ii = 0; 1090f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1091f1af5d2fSBarry Smith ir = 3*r[i]; 1092f1af5d2fSBarry Smith x[ir] = t[ii]; 1093f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1094f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1095f1af5d2fSBarry Smith ii += 3; 1096f1af5d2fSBarry Smith } 1097f1af5d2fSBarry Smith 1098f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1099f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11001ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11011ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1102dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1103f1af5d2fSBarry Smith PetscFunctionReturn(0); 1104f1af5d2fSBarry Smith } 1105f1af5d2fSBarry Smith 11064a2ae208SSatish Balay #undef __FUNCT__ 11074a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1108dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1109f1af5d2fSBarry Smith { 1110f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1111f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 11126849ba73SBarry Smith PetscErrorCode ierr; 11135d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 11145d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1115690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1116f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 111787828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 111887828ca2SBarry Smith PetscScalar *x,*b,*t; 1119f1af5d2fSBarry Smith 1120f1af5d2fSBarry Smith PetscFunctionBegin; 11211ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 11221ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1123f1af5d2fSBarry Smith t = a->solve_work; 1124f1af5d2fSBarry Smith 1125f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1126f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1127f1af5d2fSBarry Smith 1128f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1129f1af5d2fSBarry Smith ii = 0; 1130f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1131f1af5d2fSBarry Smith ic = 4*c[i]; 1132f1af5d2fSBarry Smith t[ii] = b[ic]; 1133f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1134f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1135f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1136f1af5d2fSBarry Smith ii += 4; 1137f1af5d2fSBarry Smith } 1138f1af5d2fSBarry Smith 1139f1af5d2fSBarry Smith /* forward solve the U^T */ 1140f1af5d2fSBarry Smith idx = 0; 1141f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1142f1af5d2fSBarry Smith 1143f1af5d2fSBarry Smith v = aa + 16*diag[i]; 1144f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1145f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1146f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1147f1af5d2fSBarry Smith s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1148f1af5d2fSBarry Smith s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1149f1af5d2fSBarry Smith s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1150f1af5d2fSBarry Smith v += 16; 1151f1af5d2fSBarry Smith 1152f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1153f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1154f1af5d2fSBarry Smith while (nz--) { 1155f1af5d2fSBarry Smith oidx = 4*(*vi++); 1156f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1157f1af5d2fSBarry Smith t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1158f1af5d2fSBarry Smith t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1159f1af5d2fSBarry Smith t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1160f1af5d2fSBarry Smith v += 16; 1161f1af5d2fSBarry Smith } 1162f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1163f1af5d2fSBarry Smith idx += 4; 1164f1af5d2fSBarry Smith } 1165f1af5d2fSBarry Smith /* backward solve the L^T */ 1166f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1167f1af5d2fSBarry Smith v = aa + 16*diag[i] - 16; 1168f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1169f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1170f1af5d2fSBarry Smith idt = 4*i; 1171f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1172f1af5d2fSBarry Smith while (nz--) { 1173f1af5d2fSBarry Smith idx = 4*(*vi--); 1174f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1175f1af5d2fSBarry Smith t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1176f1af5d2fSBarry Smith t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1177f1af5d2fSBarry Smith t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1178f1af5d2fSBarry Smith v -= 16; 1179f1af5d2fSBarry Smith } 1180f1af5d2fSBarry Smith } 1181f1af5d2fSBarry Smith 1182f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1183f1af5d2fSBarry Smith ii = 0; 1184f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1185f1af5d2fSBarry Smith ir = 4*r[i]; 1186f1af5d2fSBarry Smith x[ir] = t[ii]; 1187f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1188f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1189f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1190f1af5d2fSBarry Smith ii += 4; 1191f1af5d2fSBarry Smith } 1192f1af5d2fSBarry Smith 1193f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1194f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 11951ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 11961ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1197dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1198f1af5d2fSBarry Smith PetscFunctionReturn(0); 1199f1af5d2fSBarry Smith } 1200f1af5d2fSBarry Smith 12014a2ae208SSatish Balay #undef __FUNCT__ 12024a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1203dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1204f1af5d2fSBarry Smith { 1205f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1206f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 12076849ba73SBarry Smith PetscErrorCode ierr; 12085d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 12095d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1210690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1211f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 121287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 121387828ca2SBarry Smith PetscScalar *x,*b,*t; 1214f1af5d2fSBarry Smith 1215f1af5d2fSBarry Smith PetscFunctionBegin; 12161ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 12171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1218f1af5d2fSBarry Smith t = a->solve_work; 1219f1af5d2fSBarry Smith 1220f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1221f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1222f1af5d2fSBarry Smith 1223f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1224f1af5d2fSBarry Smith ii = 0; 1225f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1226f1af5d2fSBarry Smith ic = 5*c[i]; 1227f1af5d2fSBarry Smith t[ii] = b[ic]; 1228f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1229f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1230f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1231f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1232f1af5d2fSBarry Smith ii += 5; 1233f1af5d2fSBarry Smith } 1234f1af5d2fSBarry Smith 1235f1af5d2fSBarry Smith /* forward solve the U^T */ 1236f1af5d2fSBarry Smith idx = 0; 1237f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1238f1af5d2fSBarry Smith 1239f1af5d2fSBarry Smith v = aa + 25*diag[i]; 1240f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1241f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1243f1af5d2fSBarry Smith s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1244f1af5d2fSBarry Smith s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1245f1af5d2fSBarry Smith s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1246f1af5d2fSBarry Smith s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1247f1af5d2fSBarry Smith v += 25; 1248f1af5d2fSBarry Smith 1249f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1250f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1251f1af5d2fSBarry Smith while (nz--) { 1252f1af5d2fSBarry Smith oidx = 5*(*vi++); 1253f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1254f1af5d2fSBarry Smith t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1255f1af5d2fSBarry Smith t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1256f1af5d2fSBarry Smith t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1257f1af5d2fSBarry Smith t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1258f1af5d2fSBarry Smith v += 25; 1259f1af5d2fSBarry Smith } 1260f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1261f1af5d2fSBarry Smith idx += 5; 1262f1af5d2fSBarry Smith } 1263f1af5d2fSBarry Smith /* backward solve the L^T */ 1264f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1265f1af5d2fSBarry Smith v = aa + 25*diag[i] - 25; 1266f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1267f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1268f1af5d2fSBarry Smith idt = 5*i; 1269f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1270f1af5d2fSBarry Smith while (nz--) { 1271f1af5d2fSBarry Smith idx = 5*(*vi--); 1272f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1273f1af5d2fSBarry Smith t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1274f1af5d2fSBarry Smith t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1275f1af5d2fSBarry Smith t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1276f1af5d2fSBarry Smith t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1277f1af5d2fSBarry Smith v -= 25; 1278f1af5d2fSBarry Smith } 1279f1af5d2fSBarry Smith } 1280f1af5d2fSBarry Smith 1281f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1282f1af5d2fSBarry Smith ii = 0; 1283f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1284f1af5d2fSBarry Smith ir = 5*r[i]; 1285f1af5d2fSBarry Smith x[ir] = t[ii]; 1286f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1287f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1288f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1289f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1290f1af5d2fSBarry Smith ii += 5; 1291f1af5d2fSBarry Smith } 1292f1af5d2fSBarry Smith 1293f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1294f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 12951ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 12961ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1297dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1298f1af5d2fSBarry Smith PetscFunctionReturn(0); 1299f1af5d2fSBarry Smith } 1300f1af5d2fSBarry Smith 13014a2ae208SSatish Balay #undef __FUNCT__ 13024a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1303dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1304f1af5d2fSBarry Smith { 1305f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1306f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 13076849ba73SBarry Smith PetscErrorCode ierr; 13085d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 13095d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1310690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1311f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 131287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 131387828ca2SBarry Smith PetscScalar *x,*b,*t; 1314f1af5d2fSBarry Smith 1315f1af5d2fSBarry Smith PetscFunctionBegin; 13161ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 13171ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1318f1af5d2fSBarry Smith t = a->solve_work; 1319f1af5d2fSBarry Smith 1320f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1321f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1322f1af5d2fSBarry Smith 1323f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1324f1af5d2fSBarry Smith ii = 0; 1325f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1326f1af5d2fSBarry Smith ic = 6*c[i]; 1327f1af5d2fSBarry Smith t[ii] = b[ic]; 1328f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1329f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1330f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1331f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1332f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1333f1af5d2fSBarry Smith ii += 6; 1334f1af5d2fSBarry Smith } 1335f1af5d2fSBarry Smith 1336f1af5d2fSBarry Smith /* forward solve the U^T */ 1337f1af5d2fSBarry Smith idx = 0; 1338f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1339f1af5d2fSBarry Smith 1340f1af5d2fSBarry Smith v = aa + 36*diag[i]; 1341f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1342f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1343f1af5d2fSBarry Smith x6 = t[5+idx]; 1344f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1345f1af5d2fSBarry Smith s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1346f1af5d2fSBarry Smith s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1347f1af5d2fSBarry Smith s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1348f1af5d2fSBarry Smith s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1349f1af5d2fSBarry Smith s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1350f1af5d2fSBarry Smith v += 36; 1351f1af5d2fSBarry Smith 1352f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1353f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1354f1af5d2fSBarry Smith while (nz--) { 1355f1af5d2fSBarry Smith oidx = 6*(*vi++); 1356f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1357f1af5d2fSBarry Smith t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1358f1af5d2fSBarry Smith t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1359f1af5d2fSBarry Smith t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1360f1af5d2fSBarry Smith t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1361f1af5d2fSBarry Smith t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1362f1af5d2fSBarry Smith v += 36; 1363f1af5d2fSBarry Smith } 1364f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1365f1af5d2fSBarry Smith t[5+idx] = s6; 1366f1af5d2fSBarry Smith idx += 6; 1367f1af5d2fSBarry Smith } 1368f1af5d2fSBarry Smith /* backward solve the L^T */ 1369f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1370f1af5d2fSBarry Smith v = aa + 36*diag[i] - 36; 1371f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1372f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1373f1af5d2fSBarry Smith idt = 6*i; 1374f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1375f1af5d2fSBarry Smith s6 = t[5+idt]; 1376f1af5d2fSBarry Smith while (nz--) { 1377f1af5d2fSBarry Smith idx = 6*(*vi--); 1378f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1379f1af5d2fSBarry Smith t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1380f1af5d2fSBarry Smith t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1381f1af5d2fSBarry Smith t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1382f1af5d2fSBarry Smith t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1383f1af5d2fSBarry Smith t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1384f1af5d2fSBarry Smith v -= 36; 1385f1af5d2fSBarry Smith } 1386f1af5d2fSBarry Smith } 1387f1af5d2fSBarry Smith 1388f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1389f1af5d2fSBarry Smith ii = 0; 1390f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1391f1af5d2fSBarry Smith ir = 6*r[i]; 1392f1af5d2fSBarry Smith x[ir] = t[ii]; 1393f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1394f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1395f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1396f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1397f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1398f1af5d2fSBarry Smith ii += 6; 1399f1af5d2fSBarry Smith } 1400f1af5d2fSBarry Smith 1401f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1402f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 14031ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 14041ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1405dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1406f1af5d2fSBarry Smith PetscFunctionReturn(0); 1407f1af5d2fSBarry Smith } 1408f1af5d2fSBarry Smith 14094a2ae208SSatish Balay #undef __FUNCT__ 14104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1411dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1412f1af5d2fSBarry Smith { 1413f1af5d2fSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1414f1af5d2fSBarry Smith IS iscol=a->col,isrow=a->row; 14156849ba73SBarry Smith PetscErrorCode ierr; 14165d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 14175d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1418690b6cddSBarry Smith PetscInt *diag = a->diag,ii,ic,ir,oidx; 1419f1af5d2fSBarry Smith MatScalar *aa=a->a,*v; 142087828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 142187828ca2SBarry Smith PetscScalar *x,*b,*t; 1422f1af5d2fSBarry Smith 1423f1af5d2fSBarry Smith PetscFunctionBegin; 14241ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14251ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1426f1af5d2fSBarry Smith t = a->solve_work; 1427f1af5d2fSBarry Smith 1428f1af5d2fSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1429f1af5d2fSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1430f1af5d2fSBarry Smith 1431f1af5d2fSBarry Smith /* copy the b into temp work space according to permutation */ 1432f1af5d2fSBarry Smith ii = 0; 1433f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1434f1af5d2fSBarry Smith ic = 7*c[i]; 1435f1af5d2fSBarry Smith t[ii] = b[ic]; 1436f1af5d2fSBarry Smith t[ii+1] = b[ic+1]; 1437f1af5d2fSBarry Smith t[ii+2] = b[ic+2]; 1438f1af5d2fSBarry Smith t[ii+3] = b[ic+3]; 1439f1af5d2fSBarry Smith t[ii+4] = b[ic+4]; 1440f1af5d2fSBarry Smith t[ii+5] = b[ic+5]; 1441f1af5d2fSBarry Smith t[ii+6] = b[ic+6]; 1442f1af5d2fSBarry Smith ii += 7; 1443f1af5d2fSBarry Smith } 1444f1af5d2fSBarry Smith 1445f1af5d2fSBarry Smith /* forward solve the U^T */ 1446f1af5d2fSBarry Smith idx = 0; 1447f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1448f1af5d2fSBarry Smith 1449f1af5d2fSBarry Smith v = aa + 49*diag[i]; 1450f1af5d2fSBarry Smith /* multiply by the inverse of the block diagonal */ 1451f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1452f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1453f1af5d2fSBarry Smith s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1454f1af5d2fSBarry Smith s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1455f1af5d2fSBarry Smith s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1456f1af5d2fSBarry Smith s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1457f1af5d2fSBarry Smith s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1458f1af5d2fSBarry Smith s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1459f1af5d2fSBarry Smith s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1460f1af5d2fSBarry Smith v += 49; 1461f1af5d2fSBarry Smith 1462f1af5d2fSBarry Smith vi = aj + diag[i] + 1; 1463f1af5d2fSBarry Smith nz = ai[i+1] - diag[i] - 1; 1464f1af5d2fSBarry Smith while (nz--) { 1465f1af5d2fSBarry Smith oidx = 7*(*vi++); 1466f1af5d2fSBarry Smith t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1467f1af5d2fSBarry Smith t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1468f1af5d2fSBarry Smith t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1469f1af5d2fSBarry Smith t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1470f1af5d2fSBarry Smith t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1471f1af5d2fSBarry Smith t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1472f1af5d2fSBarry Smith t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1473f1af5d2fSBarry Smith v += 49; 1474f1af5d2fSBarry Smith } 1475f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1476f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 1477f1af5d2fSBarry Smith idx += 7; 1478f1af5d2fSBarry Smith } 1479f1af5d2fSBarry Smith /* backward solve the L^T */ 1480f1af5d2fSBarry Smith for (i=n-1; i>=0; i--){ 1481f1af5d2fSBarry Smith v = aa + 49*diag[i] - 49; 1482f1af5d2fSBarry Smith vi = aj + diag[i] - 1; 1483f1af5d2fSBarry Smith nz = diag[i] - ai[i]; 1484f1af5d2fSBarry Smith idt = 7*i; 1485f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1486f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 1487f1af5d2fSBarry Smith while (nz--) { 1488f1af5d2fSBarry Smith idx = 7*(*vi--); 1489f1af5d2fSBarry Smith t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1490f1af5d2fSBarry Smith t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1491f1af5d2fSBarry Smith t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1492f1af5d2fSBarry Smith t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1493f1af5d2fSBarry Smith t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1494f1af5d2fSBarry Smith t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1495f1af5d2fSBarry Smith t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1496f1af5d2fSBarry Smith v -= 49; 1497f1af5d2fSBarry Smith } 1498f1af5d2fSBarry Smith } 1499f1af5d2fSBarry Smith 1500f1af5d2fSBarry Smith /* copy t into x according to permutation */ 1501f1af5d2fSBarry Smith ii = 0; 1502f1af5d2fSBarry Smith for (i=0; i<n; i++) { 1503f1af5d2fSBarry Smith ir = 7*r[i]; 1504f1af5d2fSBarry Smith x[ir] = t[ii]; 1505f1af5d2fSBarry Smith x[ir+1] = t[ii+1]; 1506f1af5d2fSBarry Smith x[ir+2] = t[ii+2]; 1507f1af5d2fSBarry Smith x[ir+3] = t[ii+3]; 1508f1af5d2fSBarry Smith x[ir+4] = t[ii+4]; 1509f1af5d2fSBarry Smith x[ir+5] = t[ii+5]; 1510f1af5d2fSBarry Smith x[ir+6] = t[ii+6]; 1511f1af5d2fSBarry Smith ii += 7; 1512f1af5d2fSBarry Smith } 1513f1af5d2fSBarry Smith 1514f1af5d2fSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1515f1af5d2fSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15161ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15171ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1518dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1519f1af5d2fSBarry Smith PetscFunctionReturn(0); 1520f1af5d2fSBarry Smith } 1521f1af5d2fSBarry Smith 15224e2b4712SSatish Balay /* ----------------------------------------------------------- */ 15234a2ae208SSatish Balay #undef __FUNCT__ 15244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1525dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 15264e2b4712SSatish Balay { 15274e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15284e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 15296849ba73SBarry Smith PetscErrorCode ierr; 15305d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 15315d0c19d7SBarry Smith PetscInt i,n=a->mbs; 15325d0c19d7SBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 15333f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 153487828ca2SBarry Smith PetscScalar *x,*b,*s,*t,*ls; 15354e2b4712SSatish Balay 15364e2b4712SSatish Balay PetscFunctionBegin; 15371ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 15381ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1539f1af5d2fSBarry Smith t = a->solve_work; 15404e2b4712SSatish Balay 15414e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 15424e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 15434e2b4712SSatish Balay 15444e2b4712SSatish Balay /* forward solve the lower triangular */ 154587828ca2SBarry Smith ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 15464e2b4712SSatish Balay for (i=1; i<n; i++) { 15474e2b4712SSatish Balay v = aa + bs2*ai[i]; 15484e2b4712SSatish Balay vi = aj + ai[i]; 15494e2b4712SSatish Balay nz = a->diag[i] - ai[i]; 1550f1af5d2fSBarry Smith s = t + bs*i; 155187828ca2SBarry Smith ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 15524e2b4712SSatish Balay while (nz--) { 1553f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 15544e2b4712SSatish Balay v += bs2; 15554e2b4712SSatish Balay } 15564e2b4712SSatish Balay } 15574e2b4712SSatish Balay /* backward solve the upper triangular */ 1558d0f46423SBarry Smith ls = a->solve_work + A->cmap->n; 15594e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 15604e2b4712SSatish Balay v = aa + bs2*(a->diag[i] + 1); 15614e2b4712SSatish Balay vi = aj + a->diag[i] + 1; 15624e2b4712SSatish Balay nz = ai[i+1] - a->diag[i] - 1; 156387828ca2SBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 15644e2b4712SSatish Balay while (nz--) { 1565f1af5d2fSBarry Smith Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 15664e2b4712SSatish Balay v += bs2; 15674e2b4712SSatish Balay } 1568f1af5d2fSBarry Smith Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 156987828ca2SBarry Smith ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 15704e2b4712SSatish Balay } 15714e2b4712SSatish Balay 15724e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 15734e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15741ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15751ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1576dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 15774e2b4712SSatish Balay PetscFunctionReturn(0); 15784e2b4712SSatish Balay } 15794e2b4712SSatish Balay 15805c42ef9dSBarry Smith /* ----------------------------------------------------------- */ 15815c42ef9dSBarry Smith #undef __FUNCT__ 15825c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 15835c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 15845c42ef9dSBarry Smith { 15855c42ef9dSBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15865c42ef9dSBarry Smith IS iscol=a->col,isrow=a->row; 15875c42ef9dSBarry Smith PetscErrorCode ierr; 15885c42ef9dSBarry Smith const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 15895c42ef9dSBarry Smith PetscInt i,n=a->mbs,j; 15905c42ef9dSBarry Smith PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 15915c42ef9dSBarry Smith const MatScalar *aa=a->a,*v; 15925c42ef9dSBarry Smith PetscScalar *x,*t,*ls; 15935c42ef9dSBarry Smith const PetscScalar *b; 15945c42ef9dSBarry Smith PetscFunctionBegin; 15955c42ef9dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 15965c42ef9dSBarry Smith ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 15975c42ef9dSBarry Smith t = a->solve_work; 15985c42ef9dSBarry Smith 15995c42ef9dSBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 16005c42ef9dSBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 16015c42ef9dSBarry Smith 16025c42ef9dSBarry Smith /* copy the b into temp work space according to permutation */ 16035c42ef9dSBarry Smith for (i=0; i<n; i++) { 16045c42ef9dSBarry Smith for (j=0; j<bs; j++) { 16055c42ef9dSBarry Smith t[i*bs+j] = b[c[i]*bs+j]; 16065c42ef9dSBarry Smith } 16075c42ef9dSBarry Smith } 16085c42ef9dSBarry Smith 16095c42ef9dSBarry Smith 16105c42ef9dSBarry Smith /* forward solve the upper triangular transpose */ 16115c42ef9dSBarry Smith ls = a->solve_work + A->cmap->n; 16125c42ef9dSBarry Smith for (i=0; i<n; i++){ 16135c42ef9dSBarry Smith ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 16145c42ef9dSBarry Smith Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 16155c42ef9dSBarry Smith v = aa + bs2*(a->diag[i] + 1); 16165c42ef9dSBarry Smith vi = aj + a->diag[i] + 1; 16175c42ef9dSBarry Smith nz = ai[i+1] - a->diag[i] - 1; 16185c42ef9dSBarry Smith while (nz--) { 16195c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 16205c42ef9dSBarry Smith v += bs2; 16215c42ef9dSBarry Smith } 16225c42ef9dSBarry Smith } 16235c42ef9dSBarry Smith 16245c42ef9dSBarry Smith /* backward solve the lower triangular transpose */ 16255c42ef9dSBarry Smith for (i=n-1; i>=0; i--) { 16265c42ef9dSBarry Smith v = aa + bs2*ai[i]; 16275c42ef9dSBarry Smith vi = aj + ai[i]; 16285c42ef9dSBarry Smith nz = a->diag[i] - ai[i]; 16295c42ef9dSBarry Smith while (nz--) { 16305c42ef9dSBarry Smith Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 16315c42ef9dSBarry Smith v += bs2; 16325c42ef9dSBarry Smith } 16335c42ef9dSBarry Smith } 16345c42ef9dSBarry Smith 16355c42ef9dSBarry Smith /* copy t into x according to permutation */ 16365c42ef9dSBarry Smith for (i=0; i<n; i++) { 16375c42ef9dSBarry Smith for (j=0; j<bs; j++) { 16385c42ef9dSBarry Smith x[bs*r[i]+j] = t[bs*i+j]; 16395c42ef9dSBarry Smith } 16405c42ef9dSBarry Smith } 16415c42ef9dSBarry Smith 16425c42ef9dSBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 16435c42ef9dSBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 16445c42ef9dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 16455c42ef9dSBarry Smith ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 16465c42ef9dSBarry Smith ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 16475c42ef9dSBarry Smith PetscFunctionReturn(0); 16485c42ef9dSBarry Smith } 16495c42ef9dSBarry Smith 16504a2ae208SSatish Balay #undef __FUNCT__ 1651*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct" 1652*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx) 1653*8499736aSShri Abhyankar { 1654*8499736aSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1655*8499736aSShri Abhyankar IS iscol=a->col,isrow=a->row; 1656*8499736aSShri Abhyankar PetscErrorCode ierr; 1657*8499736aSShri Abhyankar const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 1658*8499736aSShri Abhyankar PetscInt i,n=a->mbs,j; 1659*8499736aSShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1660*8499736aSShri Abhyankar const MatScalar *aa=a->a,*v; 1661*8499736aSShri Abhyankar PetscScalar *x,*t,*ls; 1662*8499736aSShri Abhyankar const PetscScalar *b; 1663*8499736aSShri Abhyankar PetscFunctionBegin; 1664*8499736aSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1665*8499736aSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1666*8499736aSShri Abhyankar t = a->solve_work; 1667*8499736aSShri Abhyankar 1668*8499736aSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1669*8499736aSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1670*8499736aSShri Abhyankar 1671*8499736aSShri Abhyankar /* copy the b into temp work space according to permutation */ 1672*8499736aSShri Abhyankar for (i=0; i<n; i++) { 1673*8499736aSShri Abhyankar for (j=0; j<bs; j++) { 1674*8499736aSShri Abhyankar t[i*bs+j] = b[c[i]*bs+j]; 1675*8499736aSShri Abhyankar } 1676*8499736aSShri Abhyankar } 1677*8499736aSShri Abhyankar 1678*8499736aSShri Abhyankar 1679*8499736aSShri Abhyankar /* forward solve the upper triangular transpose */ 1680*8499736aSShri Abhyankar ls = a->solve_work + A->cmap->n; 1681*8499736aSShri Abhyankar for (i=0; i<n; i++){ 1682*8499736aSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1683*8499736aSShri Abhyankar Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 1684*8499736aSShri Abhyankar v = aa + bs2*(diag[i] - 1); 1685*8499736aSShri Abhyankar vi = aj + diag[i] - 1; 1686*8499736aSShri Abhyankar nz = diag[i] - diag[i+1] - 1; 1687*8499736aSShri Abhyankar for(j=0;j>-nz;j--){ 1688*8499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1689*8499736aSShri Abhyankar v -= bs2; 1690*8499736aSShri Abhyankar } 1691*8499736aSShri Abhyankar } 1692*8499736aSShri Abhyankar 1693*8499736aSShri Abhyankar /* backward solve the lower triangular transpose */ 1694*8499736aSShri Abhyankar for (i=n-1; i>=0; i--) { 1695*8499736aSShri Abhyankar v = aa + bs2*ai[i]; 1696*8499736aSShri Abhyankar vi = aj + ai[i]; 1697*8499736aSShri Abhyankar nz = ai[i+1] - ai[i]; 1698*8499736aSShri Abhyankar for(j=0;j<nz;j++){ 1699*8499736aSShri Abhyankar Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1700*8499736aSShri Abhyankar v += bs2; 1701*8499736aSShri Abhyankar } 1702*8499736aSShri Abhyankar } 1703*8499736aSShri Abhyankar 1704*8499736aSShri Abhyankar /* copy t into x according to permutation */ 1705*8499736aSShri Abhyankar for (i=0; i<n; i++) { 1706*8499736aSShri Abhyankar for (j=0; j<bs; j++) { 1707*8499736aSShri Abhyankar x[bs*r[i]+j] = t[bs*i+j]; 1708*8499736aSShri Abhyankar } 1709*8499736aSShri Abhyankar } 1710*8499736aSShri Abhyankar 1711*8499736aSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1712*8499736aSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1713*8499736aSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1714*8499736aSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1715*8499736aSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1716*8499736aSShri Abhyankar PetscFunctionReturn(0); 1717*8499736aSShri Abhyankar } 1718*8499736aSShri Abhyankar 1719*8499736aSShri Abhyankar #undef __FUNCT__ 17204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1721dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 17224e2b4712SSatish Balay { 17234e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17244e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 17256849ba73SBarry Smith PetscErrorCode ierr; 17265d0c19d7SBarry Smith const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 17275d0c19d7SBarry Smith PetscInt i,n=a->mbs,nz,idx,idt,idc; 17283f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 172987828ca2SBarry Smith PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 173087828ca2SBarry Smith PetscScalar *x,*b,*t; 17314e2b4712SSatish Balay 17324e2b4712SSatish Balay PetscFunctionBegin; 17331ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 17341ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1735f1af5d2fSBarry Smith t = a->solve_work; 17364e2b4712SSatish Balay 17374e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 17384e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 17394e2b4712SSatish Balay 17404e2b4712SSatish Balay /* forward solve the lower triangular */ 17414e2b4712SSatish Balay idx = 7*(*r++); 1742f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 1743f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1744f1af5d2fSBarry Smith t[5] = b[5+idx]; t[6] = b[6+idx]; 17454e2b4712SSatish Balay 17464e2b4712SSatish Balay for (i=1; i<n; i++) { 17474e2b4712SSatish Balay v = aa + 49*ai[i]; 17484e2b4712SSatish Balay vi = aj + ai[i]; 17494e2b4712SSatish Balay nz = diag[i] - ai[i]; 17504e2b4712SSatish Balay idx = 7*(*r++); 1751f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1752f1af5d2fSBarry Smith s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 17534e2b4712SSatish Balay while (nz--) { 17544e2b4712SSatish Balay idx = 7*(*vi++); 1755f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1756f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 1757f1af5d2fSBarry Smith x6 = t[5+idx];x7 = t[6+idx]; 1758f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1759f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1760f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1761f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1762f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1763f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1764f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 17654e2b4712SSatish Balay v += 49; 17664e2b4712SSatish Balay } 17674e2b4712SSatish Balay idx = 7*i; 1768f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 1769f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1770f1af5d2fSBarry Smith t[5+idx] = s6;t[6+idx] = s7; 17714e2b4712SSatish Balay } 17724e2b4712SSatish Balay /* backward solve the upper triangular */ 17734e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 17744e2b4712SSatish Balay v = aa + 49*diag[i] + 49; 17754e2b4712SSatish Balay vi = aj + diag[i] + 1; 17764e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 17774e2b4712SSatish Balay idt = 7*i; 1778f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 1779f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1780f1af5d2fSBarry Smith s6 = t[5+idt];s7 = t[6+idt]; 17814e2b4712SSatish Balay while (nz--) { 17824e2b4712SSatish Balay idx = 7*(*vi++); 1783f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 1784f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1785f1af5d2fSBarry Smith x6 = t[5+idx]; x7 = t[6+idx]; 1786f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1787f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1788f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1789f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1790f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1791f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1792f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 17934e2b4712SSatish Balay v += 49; 17944e2b4712SSatish Balay } 17954e2b4712SSatish Balay idc = 7*(*c--); 17964e2b4712SSatish Balay v = aa + 49*diag[i]; 1797f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1798f1af5d2fSBarry Smith v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1799f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1800f1af5d2fSBarry Smith v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1801f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1802f1af5d2fSBarry Smith v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1803f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1804f1af5d2fSBarry Smith v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1805f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1806f1af5d2fSBarry Smith v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1807f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1808f1af5d2fSBarry Smith v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1809f1af5d2fSBarry Smith x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1810f1af5d2fSBarry Smith v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 18114e2b4712SSatish Balay } 18124e2b4712SSatish Balay 18134e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 18144e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 18151ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 18161ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1817dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 18184e2b4712SSatish Balay PetscFunctionReturn(0); 18194e2b4712SSatish Balay } 18204e2b4712SSatish Balay 18218f690400SShri Abhyankar #undef __FUNCT__ 1822a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1823a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 182435aa4fcfSShri Abhyankar { 182535aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 182635aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 182735aa4fcfSShri Abhyankar PetscErrorCode ierr; 182835aa4fcfSShri Abhyankar const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 182935aa4fcfSShri Abhyankar PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 183035aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 183135aa4fcfSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 183235aa4fcfSShri Abhyankar PetscScalar *x,*b,*t; 183335aa4fcfSShri Abhyankar 183435aa4fcfSShri Abhyankar PetscFunctionBegin; 183535aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 183635aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 183735aa4fcfSShri Abhyankar t = a->solve_work; 183835aa4fcfSShri Abhyankar 183935aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 184035aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 184135aa4fcfSShri Abhyankar 184235aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 184335aa4fcfSShri Abhyankar idx = 7*r[0]; 184435aa4fcfSShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 184535aa4fcfSShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 184635aa4fcfSShri Abhyankar t[5] = b[5+idx]; t[6] = b[6+idx]; 184735aa4fcfSShri Abhyankar 184835aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 184935aa4fcfSShri Abhyankar v = aa + 49*ai[i]; 185035aa4fcfSShri Abhyankar vi = aj + ai[i]; 185135aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 185235aa4fcfSShri Abhyankar idx = 7*r[i]; 185335aa4fcfSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 185435aa4fcfSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 185535aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 185635aa4fcfSShri Abhyankar idx = 7*vi[m]; 185735aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 185835aa4fcfSShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 185935aa4fcfSShri Abhyankar x6 = t[5+idx];x7 = t[6+idx]; 186035aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 186135aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 186235aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 186335aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 186435aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 186535aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 186635aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 186735aa4fcfSShri Abhyankar v += 49; 186835aa4fcfSShri Abhyankar } 186935aa4fcfSShri Abhyankar idx = 7*i; 187035aa4fcfSShri Abhyankar t[idx] = s1;t[1+idx] = s2; 187135aa4fcfSShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 187235aa4fcfSShri Abhyankar t[5+idx] = s6;t[6+idx] = s7; 187335aa4fcfSShri Abhyankar } 187435aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 187535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 187635aa4fcfSShri Abhyankar v = aa + 49*(adiag[i+1]+1); 187735aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 187835aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 187935aa4fcfSShri Abhyankar idt = 7*i; 188035aa4fcfSShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 188135aa4fcfSShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 188235aa4fcfSShri Abhyankar s6 = t[5+idt];s7 = t[6+idt]; 188335aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 188435aa4fcfSShri Abhyankar idx = 7*vi[m]; 188535aa4fcfSShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 188635aa4fcfSShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 188735aa4fcfSShri Abhyankar x6 = t[5+idx]; x7 = t[6+idx]; 188835aa4fcfSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 188935aa4fcfSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 189035aa4fcfSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 189135aa4fcfSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 189235aa4fcfSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 189335aa4fcfSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 189435aa4fcfSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 189535aa4fcfSShri Abhyankar v += 49; 189635aa4fcfSShri Abhyankar } 189735aa4fcfSShri Abhyankar idc = 7*c[i]; 189835aa4fcfSShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 189935aa4fcfSShri Abhyankar v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 190035aa4fcfSShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 190135aa4fcfSShri Abhyankar v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 190235aa4fcfSShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 190335aa4fcfSShri Abhyankar v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 190435aa4fcfSShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 190535aa4fcfSShri Abhyankar v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 190635aa4fcfSShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 190735aa4fcfSShri Abhyankar v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 190835aa4fcfSShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 190935aa4fcfSShri Abhyankar v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 191035aa4fcfSShri Abhyankar x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 191135aa4fcfSShri Abhyankar v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 191235aa4fcfSShri Abhyankar } 191335aa4fcfSShri Abhyankar 191435aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 191535aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 191635aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 191735aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 191835aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 191935aa4fcfSShri Abhyankar PetscFunctionReturn(0); 192035aa4fcfSShri Abhyankar } 192135aa4fcfSShri Abhyankar 192235aa4fcfSShri Abhyankar #undef __FUNCT__ 19234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1924dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 192515091d37SBarry Smith { 192615091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1927690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1928dfbe8321SBarry Smith PetscErrorCode ierr; 1929690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 1930d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 1931d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1932d9fead3dSBarry Smith const PetscScalar *b; 193315091d37SBarry Smith 193415091d37SBarry Smith PetscFunctionBegin; 1935d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 19361ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 193715091d37SBarry Smith /* forward solve the lower triangular */ 193815091d37SBarry Smith idx = 0; 193915091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 194015091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 194115091d37SBarry Smith x[6] = b[6+idx]; 194215091d37SBarry Smith for (i=1; i<n; i++) { 194315091d37SBarry Smith v = aa + 49*ai[i]; 194415091d37SBarry Smith vi = aj + ai[i]; 194515091d37SBarry Smith nz = diag[i] - ai[i]; 194615091d37SBarry Smith idx = 7*i; 1947f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1948f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1949f1af5d2fSBarry Smith s7 = b[6+idx]; 195015091d37SBarry Smith while (nz--) { 195115091d37SBarry Smith jdx = 7*(*vi++); 195215091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 195315091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 195415091d37SBarry Smith x7 = x[6+jdx]; 1955f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1956f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1957f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1958f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1959f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1960f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1961f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 196215091d37SBarry Smith v += 49; 196315091d37SBarry Smith } 1964f1af5d2fSBarry Smith x[idx] = s1; 1965f1af5d2fSBarry Smith x[1+idx] = s2; 1966f1af5d2fSBarry Smith x[2+idx] = s3; 1967f1af5d2fSBarry Smith x[3+idx] = s4; 1968f1af5d2fSBarry Smith x[4+idx] = s5; 1969f1af5d2fSBarry Smith x[5+idx] = s6; 1970f1af5d2fSBarry Smith x[6+idx] = s7; 197115091d37SBarry Smith } 197215091d37SBarry Smith /* backward solve the upper triangular */ 197315091d37SBarry Smith for (i=n-1; i>=0; i--){ 197415091d37SBarry Smith v = aa + 49*diag[i] + 49; 197515091d37SBarry Smith vi = aj + diag[i] + 1; 197615091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 197715091d37SBarry Smith idt = 7*i; 1978f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 1979f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 1980f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 1981f1af5d2fSBarry Smith s7 = x[6+idt]; 198215091d37SBarry Smith while (nz--) { 198315091d37SBarry Smith idx = 7*(*vi++); 198415091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 198515091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 198615091d37SBarry Smith x7 = x[6+idx]; 1987f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1988f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1989f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1990f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1991f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1992f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1993f1af5d2fSBarry Smith s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 199415091d37SBarry Smith v += 49; 199515091d37SBarry Smith } 199615091d37SBarry Smith v = aa + 49*diag[i]; 1997f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1998f1af5d2fSBarry Smith + v[28]*s5 + v[35]*s6 + v[42]*s7; 1999f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2000f1af5d2fSBarry Smith + v[29]*s5 + v[36]*s6 + v[43]*s7; 2001f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2002f1af5d2fSBarry Smith + v[30]*s5 + v[37]*s6 + v[44]*s7; 2003f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2004f1af5d2fSBarry Smith + v[31]*s5 + v[38]*s6 + v[45]*s7; 2005f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2006f1af5d2fSBarry Smith + v[32]*s5 + v[39]*s6 + v[46]*s7; 2007f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2008f1af5d2fSBarry Smith + v[33]*s5 + v[40]*s6 + v[47]*s7; 2009f1af5d2fSBarry Smith x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2010f1af5d2fSBarry Smith + v[34]*s5 + v[41]*s6 + v[48]*s7; 201115091d37SBarry Smith } 201215091d37SBarry Smith 2013d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 20141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2015dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 201615091d37SBarry Smith PetscFunctionReturn(0); 201715091d37SBarry Smith } 201815091d37SBarry Smith 2019cee9d6f2SShri Abhyankar #undef __FUNCT__ 2020a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 2021a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 202253cca76cSShri Abhyankar { 202353cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 202453cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 202553cca76cSShri Abhyankar PetscErrorCode ierr; 202653cca76cSShri Abhyankar PetscInt idx,jdx,idt; 202753cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 202853cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 202953cca76cSShri Abhyankar PetscScalar *x; 203053cca76cSShri Abhyankar const PetscScalar *b; 203153cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 203253cca76cSShri Abhyankar 203353cca76cSShri Abhyankar PetscFunctionBegin; 203453cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 203553cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 203653cca76cSShri Abhyankar /* forward solve the lower triangular */ 203753cca76cSShri Abhyankar idx = 0; 203853cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 203953cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 204053cca76cSShri Abhyankar for (i=1; i<n; i++) { 204153cca76cSShri Abhyankar v = aa + bs2*ai[i]; 204253cca76cSShri Abhyankar vi = aj + ai[i]; 204353cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 204453cca76cSShri Abhyankar idx = bs*i; 204553cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 204653cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 204753cca76cSShri Abhyankar for(k=0;k<nz;k++) { 204853cca76cSShri Abhyankar jdx = bs*vi[k]; 204953cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 205053cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 205153cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 205253cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 205353cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 205453cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 205553cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 205653cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 205753cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 205853cca76cSShri Abhyankar v += bs2; 205953cca76cSShri Abhyankar } 206053cca76cSShri Abhyankar 206153cca76cSShri Abhyankar x[idx] = s1; 206253cca76cSShri Abhyankar x[1+idx] = s2; 206353cca76cSShri Abhyankar x[2+idx] = s3; 206453cca76cSShri Abhyankar x[3+idx] = s4; 206553cca76cSShri Abhyankar x[4+idx] = s5; 206653cca76cSShri Abhyankar x[5+idx] = s6; 206753cca76cSShri Abhyankar x[6+idx] = s7; 206853cca76cSShri Abhyankar } 206953cca76cSShri Abhyankar 207053cca76cSShri Abhyankar /* backward solve the upper triangular */ 207153cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 207253cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 207353cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 207453cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 207553cca76cSShri Abhyankar idt = bs*i; 207653cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 207753cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 207853cca76cSShri Abhyankar for(k=0;k<nz;k++) { 207953cca76cSShri Abhyankar idx = bs*vi[k]; 208053cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 208153cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 208253cca76cSShri Abhyankar s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 208353cca76cSShri Abhyankar s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 208453cca76cSShri Abhyankar s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 208553cca76cSShri Abhyankar s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 208653cca76cSShri Abhyankar s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 208753cca76cSShri Abhyankar s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 208853cca76cSShri Abhyankar s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 208953cca76cSShri Abhyankar v += bs2; 209053cca76cSShri Abhyankar } 209153cca76cSShri Abhyankar /* x = inv_diagonal*x */ 209253cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 209353cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 209453cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 209553cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 209653cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 209753cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 209853cca76cSShri Abhyankar x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 209953cca76cSShri Abhyankar } 210053cca76cSShri Abhyankar 210153cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 210253cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 210353cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 210453cca76cSShri Abhyankar PetscFunctionReturn(0); 210553cca76cSShri Abhyankar } 210653cca76cSShri Abhyankar 210753cca76cSShri Abhyankar #undef __FUNCT__ 21084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2109dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 211015091d37SBarry Smith { 211115091d37SBarry Smith Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 211215091d37SBarry Smith IS iscol=a->col,isrow=a->row; 21136849ba73SBarry Smith PetscErrorCode ierr; 21145d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout; 21155d0c19d7SBarry Smith PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2116d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2117d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2118d9fead3dSBarry Smith const PetscScalar *b; 211915091d37SBarry Smith PetscFunctionBegin; 2120d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21211ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2122f1af5d2fSBarry Smith t = a->solve_work; 212315091d37SBarry Smith 212415091d37SBarry Smith ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 212515091d37SBarry Smith ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 212615091d37SBarry Smith 212715091d37SBarry Smith /* forward solve the lower triangular */ 212815091d37SBarry Smith idx = 6*(*r++); 2129f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2130f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 2131f1af5d2fSBarry Smith t[4] = b[4+idx]; t[5] = b[5+idx]; 213215091d37SBarry Smith for (i=1; i<n; i++) { 213315091d37SBarry Smith v = aa + 36*ai[i]; 213415091d37SBarry Smith vi = aj + ai[i]; 213515091d37SBarry Smith nz = diag[i] - ai[i]; 213615091d37SBarry Smith idx = 6*(*r++); 2137f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2138f1af5d2fSBarry Smith s5 = b[4+idx]; s6 = b[5+idx]; 213915091d37SBarry Smith while (nz--) { 214015091d37SBarry Smith idx = 6*(*vi++); 2141f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2142f1af5d2fSBarry Smith x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2143f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2144f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2145f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2146f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2147f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2148f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 214915091d37SBarry Smith v += 36; 215015091d37SBarry Smith } 215115091d37SBarry Smith idx = 6*i; 2152f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2153f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 2154f1af5d2fSBarry Smith t[4+idx] = s5;t[5+idx] = s6; 215515091d37SBarry Smith } 215615091d37SBarry Smith /* backward solve the upper triangular */ 215715091d37SBarry Smith for (i=n-1; i>=0; i--){ 215815091d37SBarry Smith v = aa + 36*diag[i] + 36; 215915091d37SBarry Smith vi = aj + diag[i] + 1; 216015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 216115091d37SBarry Smith idt = 6*i; 2162f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2163f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 2164f1af5d2fSBarry Smith s5 = t[4+idt];s6 = t[5+idt]; 216515091d37SBarry Smith while (nz--) { 216615091d37SBarry Smith idx = 6*(*vi++); 2167f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2168f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2169f1af5d2fSBarry Smith x5 = t[4+idx]; x6 = t[5+idx]; 2170f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2171f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2172f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2173f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2174f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2175f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 217615091d37SBarry Smith v += 36; 217715091d37SBarry Smith } 217815091d37SBarry Smith idc = 6*(*c--); 217915091d37SBarry Smith v = aa + 36*diag[i]; 2180f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2181f1af5d2fSBarry Smith v[18]*s4+v[24]*s5+v[30]*s6; 2182f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2183f1af5d2fSBarry Smith v[19]*s4+v[25]*s5+v[31]*s6; 2184f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2185f1af5d2fSBarry Smith v[20]*s4+v[26]*s5+v[32]*s6; 2186f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2187f1af5d2fSBarry Smith v[21]*s4+v[27]*s5+v[33]*s6; 2188f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2189f1af5d2fSBarry Smith v[22]*s4+v[28]*s5+v[34]*s6; 2190f1af5d2fSBarry Smith x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2191f1af5d2fSBarry Smith v[23]*s4+v[29]*s5+v[35]*s6; 219215091d37SBarry Smith } 219315091d37SBarry Smith 219415091d37SBarry Smith ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 219515091d37SBarry Smith ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2196d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 21971ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2198dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 219915091d37SBarry Smith PetscFunctionReturn(0); 220015091d37SBarry Smith } 220115091d37SBarry Smith 22026506fda5SShri Abhyankar #undef __FUNCT__ 2203a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2204a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 22056506fda5SShri Abhyankar { 22066506fda5SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 22076506fda5SShri Abhyankar IS iscol=a->col,isrow=a->row; 22086506fda5SShri Abhyankar PetscErrorCode ierr; 22096506fda5SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 22106506fda5SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 22116506fda5SShri Abhyankar const MatScalar *aa=a->a,*v; 22126506fda5SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 22136506fda5SShri Abhyankar const PetscScalar *b; 22146506fda5SShri Abhyankar PetscFunctionBegin; 22156506fda5SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22166506fda5SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 22176506fda5SShri Abhyankar t = a->solve_work; 22186506fda5SShri Abhyankar 22196506fda5SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 22206506fda5SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 22216506fda5SShri Abhyankar 22226506fda5SShri Abhyankar /* forward solve the lower triangular */ 22236506fda5SShri Abhyankar idx = 6*r[0]; 22246506fda5SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 22256506fda5SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 22266506fda5SShri Abhyankar t[4] = b[4+idx]; t[5] = b[5+idx]; 22276506fda5SShri Abhyankar for (i=1; i<n; i++) { 22286506fda5SShri Abhyankar v = aa + 36*ai[i]; 22296506fda5SShri Abhyankar vi = aj + ai[i]; 22306506fda5SShri Abhyankar nz = ai[i+1] - ai[i]; 22316506fda5SShri Abhyankar idx = 6*r[i]; 22326506fda5SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 22336506fda5SShri Abhyankar s5 = b[4+idx]; s6 = b[5+idx]; 22346506fda5SShri Abhyankar for(m=0;m<nz;m++){ 22356506fda5SShri Abhyankar idx = 6*vi[m]; 22366506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 22376506fda5SShri Abhyankar x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 22386506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 22396506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 22406506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 22416506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 22426506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 22436506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 22446506fda5SShri Abhyankar v += 36; 22456506fda5SShri Abhyankar } 22466506fda5SShri Abhyankar idx = 6*i; 22476506fda5SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 22486506fda5SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 22496506fda5SShri Abhyankar t[4+idx] = s5;t[5+idx] = s6; 22506506fda5SShri Abhyankar } 22516506fda5SShri Abhyankar /* backward solve the upper triangular */ 22526506fda5SShri Abhyankar for (i=n-1; i>=0; i--){ 22536506fda5SShri Abhyankar v = aa + 36*(adiag[i+1]+1); 22546506fda5SShri Abhyankar vi = aj + adiag[i+1]+1; 22556506fda5SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 22566506fda5SShri Abhyankar idt = 6*i; 22576506fda5SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 22586506fda5SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 22596506fda5SShri Abhyankar s5 = t[4+idt];s6 = t[5+idt]; 22606506fda5SShri Abhyankar for(m=0;m<nz;m++){ 22616506fda5SShri Abhyankar idx = 6*vi[m]; 22626506fda5SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 22636506fda5SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 22646506fda5SShri Abhyankar x5 = t[4+idx]; x6 = t[5+idx]; 22656506fda5SShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 22666506fda5SShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 22676506fda5SShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 22686506fda5SShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 22696506fda5SShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 22706506fda5SShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 22716506fda5SShri Abhyankar v += 36; 22726506fda5SShri Abhyankar } 22736506fda5SShri Abhyankar idc = 6*c[i]; 22746506fda5SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 22756506fda5SShri Abhyankar v[18]*s4+v[24]*s5+v[30]*s6; 22766506fda5SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 22776506fda5SShri Abhyankar v[19]*s4+v[25]*s5+v[31]*s6; 22786506fda5SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 22796506fda5SShri Abhyankar v[20]*s4+v[26]*s5+v[32]*s6; 22806506fda5SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 22816506fda5SShri Abhyankar v[21]*s4+v[27]*s5+v[33]*s6; 22826506fda5SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 22836506fda5SShri Abhyankar v[22]*s4+v[28]*s5+v[34]*s6; 22846506fda5SShri Abhyankar x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 22856506fda5SShri Abhyankar v[23]*s4+v[29]*s5+v[35]*s6; 22866506fda5SShri Abhyankar } 22876506fda5SShri Abhyankar 22886506fda5SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 22896506fda5SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 22906506fda5SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 22916506fda5SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 22926506fda5SShri Abhyankar ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 22936506fda5SShri Abhyankar PetscFunctionReturn(0); 22946506fda5SShri Abhyankar } 22958f690400SShri Abhyankar 22968f690400SShri Abhyankar #undef __FUNCT__ 22974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2298dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 229915091d37SBarry Smith { 230015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2301690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2302dfbe8321SBarry Smith PetscErrorCode ierr; 2303690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2304d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2305d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2306d9fead3dSBarry Smith const PetscScalar *b; 230715091d37SBarry Smith 230815091d37SBarry Smith PetscFunctionBegin; 2309d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23101ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 231115091d37SBarry Smith /* forward solve the lower triangular */ 231215091d37SBarry Smith idx = 0; 231315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 231415091d37SBarry Smith x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 231515091d37SBarry Smith for (i=1; i<n; i++) { 231615091d37SBarry Smith v = aa + 36*ai[i]; 231715091d37SBarry Smith vi = aj + ai[i]; 231815091d37SBarry Smith nz = diag[i] - ai[i]; 231915091d37SBarry Smith idx = 6*i; 2320f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2321f1af5d2fSBarry Smith s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 232215091d37SBarry Smith while (nz--) { 232315091d37SBarry Smith jdx = 6*(*vi++); 232415091d37SBarry Smith x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 232515091d37SBarry Smith x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2326f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2327f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2328f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2329f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2330f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2331f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 233215091d37SBarry Smith v += 36; 233315091d37SBarry Smith } 2334f1af5d2fSBarry Smith x[idx] = s1; 2335f1af5d2fSBarry Smith x[1+idx] = s2; 2336f1af5d2fSBarry Smith x[2+idx] = s3; 2337f1af5d2fSBarry Smith x[3+idx] = s4; 2338f1af5d2fSBarry Smith x[4+idx] = s5; 2339f1af5d2fSBarry Smith x[5+idx] = s6; 234015091d37SBarry Smith } 234115091d37SBarry Smith /* backward solve the upper triangular */ 234215091d37SBarry Smith for (i=n-1; i>=0; i--){ 234315091d37SBarry Smith v = aa + 36*diag[i] + 36; 234415091d37SBarry Smith vi = aj + diag[i] + 1; 234515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 234615091d37SBarry Smith idt = 6*i; 2347f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2348f1af5d2fSBarry Smith s3 = x[2+idt]; s4 = x[3+idt]; 2349f1af5d2fSBarry Smith s5 = x[4+idt]; s6 = x[5+idt]; 235015091d37SBarry Smith while (nz--) { 235115091d37SBarry Smith idx = 6*(*vi++); 235215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 235315091d37SBarry Smith x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2354f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2355f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2356f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2357f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2358f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2359f1af5d2fSBarry Smith s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 236015091d37SBarry Smith v += 36; 236115091d37SBarry Smith } 236215091d37SBarry Smith v = aa + 36*diag[i]; 2363f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2364f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2365f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2366f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2367f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2368f1af5d2fSBarry Smith x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 236915091d37SBarry Smith } 237015091d37SBarry Smith 2371d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 23721ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2373dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 237415091d37SBarry Smith PetscFunctionReturn(0); 237515091d37SBarry Smith } 237615091d37SBarry Smith 2377cee9d6f2SShri Abhyankar #undef __FUNCT__ 2378a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2379a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 238053cca76cSShri Abhyankar { 238153cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 238253cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 238353cca76cSShri Abhyankar PetscErrorCode ierr; 238453cca76cSShri Abhyankar PetscInt idx,jdx,idt; 238553cca76cSShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 238653cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 238753cca76cSShri Abhyankar PetscScalar *x; 238853cca76cSShri Abhyankar const PetscScalar *b; 238953cca76cSShri Abhyankar PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 239053cca76cSShri Abhyankar 239153cca76cSShri Abhyankar PetscFunctionBegin; 239253cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 239353cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 239453cca76cSShri Abhyankar /* forward solve the lower triangular */ 239553cca76cSShri Abhyankar idx = 0; 239653cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 239753cca76cSShri Abhyankar x[4] = b[4+idx];x[5] = b[5+idx]; 239853cca76cSShri Abhyankar for (i=1; i<n; i++) { 239953cca76cSShri Abhyankar v = aa + bs2*ai[i]; 240053cca76cSShri Abhyankar vi = aj + ai[i]; 240153cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 240253cca76cSShri Abhyankar idx = bs*i; 240353cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 240453cca76cSShri Abhyankar s5 = b[4+idx];s6 = b[5+idx]; 240553cca76cSShri Abhyankar for(k=0;k<nz;k++){ 240653cca76cSShri Abhyankar jdx = bs*vi[k]; 240753cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 240853cca76cSShri Abhyankar x5 = x[4+jdx]; x6 = x[5+jdx]; 240953cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 241053cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 241153cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 241253cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 241353cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 241453cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 241553cca76cSShri Abhyankar v += bs2; 241653cca76cSShri Abhyankar } 241753cca76cSShri Abhyankar 241853cca76cSShri Abhyankar x[idx] = s1; 241953cca76cSShri Abhyankar x[1+idx] = s2; 242053cca76cSShri Abhyankar x[2+idx] = s3; 242153cca76cSShri Abhyankar x[3+idx] = s4; 242253cca76cSShri Abhyankar x[4+idx] = s5; 242353cca76cSShri Abhyankar x[5+idx] = s6; 242453cca76cSShri Abhyankar } 242553cca76cSShri Abhyankar 242653cca76cSShri Abhyankar /* backward solve the upper triangular */ 242753cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 242853cca76cSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 242953cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 243053cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 243153cca76cSShri Abhyankar idt = bs*i; 243253cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 243353cca76cSShri Abhyankar s5 = x[4+idt];s6 = x[5+idt]; 243453cca76cSShri Abhyankar for(k=0;k<nz;k++){ 243553cca76cSShri Abhyankar idx = bs*vi[k]; 243653cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 243753cca76cSShri Abhyankar x5 = x[4+idx];x6 = x[5+idx]; 243853cca76cSShri Abhyankar s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 243953cca76cSShri Abhyankar s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 244053cca76cSShri Abhyankar s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 244153cca76cSShri Abhyankar s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 244253cca76cSShri Abhyankar s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 244353cca76cSShri Abhyankar s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 244453cca76cSShri Abhyankar v += bs2; 244553cca76cSShri Abhyankar } 244653cca76cSShri Abhyankar /* x = inv_diagonal*x */ 244753cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 244853cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 244953cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 245053cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 245153cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 245253cca76cSShri Abhyankar x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 245353cca76cSShri Abhyankar } 245453cca76cSShri Abhyankar 245553cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 245653cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 245753cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 245853cca76cSShri Abhyankar PetscFunctionReturn(0); 245953cca76cSShri Abhyankar } 246053cca76cSShri Abhyankar 246153cca76cSShri Abhyankar #undef __FUNCT__ 24624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2463dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 24644e2b4712SSatish Balay { 24654e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 24664e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 24676849ba73SBarry Smith PetscErrorCode ierr; 24685d0c19d7SBarry Smith const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 24695d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2470d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2471d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2472d9fead3dSBarry Smith const PetscScalar *b; 24734e2b4712SSatish Balay 24744e2b4712SSatish Balay PetscFunctionBegin; 2475d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 24761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2477f1af5d2fSBarry Smith t = a->solve_work; 24784e2b4712SSatish Balay 24794e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 24804e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 24814e2b4712SSatish Balay 24824e2b4712SSatish Balay /* forward solve the lower triangular */ 24834e2b4712SSatish Balay idx = 5*(*r++); 2484f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2485f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 24864e2b4712SSatish Balay for (i=1; i<n; i++) { 24874e2b4712SSatish Balay v = aa + 25*ai[i]; 24884e2b4712SSatish Balay vi = aj + ai[i]; 24894e2b4712SSatish Balay nz = diag[i] - ai[i]; 24904e2b4712SSatish Balay idx = 5*(*r++); 2491f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2492f1af5d2fSBarry Smith s5 = b[4+idx]; 24934e2b4712SSatish Balay while (nz--) { 24944e2b4712SSatish Balay idx = 5*(*vi++); 2495f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2496f1af5d2fSBarry Smith x4 = t[3+idx];x5 = t[4+idx]; 2497f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2498f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2499f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2500f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2501f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 25024e2b4712SSatish Balay v += 25; 25034e2b4712SSatish Balay } 25044e2b4712SSatish Balay idx = 5*i; 2505f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2506f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 25074e2b4712SSatish Balay } 25084e2b4712SSatish Balay /* backward solve the upper triangular */ 25094e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 25104e2b4712SSatish Balay v = aa + 25*diag[i] + 25; 25114e2b4712SSatish Balay vi = aj + diag[i] + 1; 25124e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 25134e2b4712SSatish Balay idt = 5*i; 2514f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2515f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 25164e2b4712SSatish Balay while (nz--) { 25174e2b4712SSatish Balay idx = 5*(*vi++); 2518f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2519f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2520f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2521f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2522f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2523f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2524f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 25254e2b4712SSatish Balay v += 25; 25264e2b4712SSatish Balay } 25274e2b4712SSatish Balay idc = 5*(*c--); 25284e2b4712SSatish Balay v = aa + 25*diag[i]; 2529f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2530f1af5d2fSBarry Smith v[15]*s4+v[20]*s5; 2531f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2532f1af5d2fSBarry Smith v[16]*s4+v[21]*s5; 2533f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2534f1af5d2fSBarry Smith v[17]*s4+v[22]*s5; 2535f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2536f1af5d2fSBarry Smith v[18]*s4+v[23]*s5; 2537f1af5d2fSBarry Smith x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2538f1af5d2fSBarry Smith v[19]*s4+v[24]*s5; 25394e2b4712SSatish Balay } 25404e2b4712SSatish Balay 25414e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 25424e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2543d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 25441ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2545dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 25464e2b4712SSatish Balay PetscFunctionReturn(0); 25474e2b4712SSatish Balay } 25484e2b4712SSatish Balay 254978bb4007SShri Abhyankar #undef __FUNCT__ 2550a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2551a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 255278bb4007SShri Abhyankar { 255378bb4007SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 255478bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 255578bb4007SShri Abhyankar PetscErrorCode ierr; 255678bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 255778bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 255878bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 255978bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 256078bb4007SShri Abhyankar const PetscScalar *b; 256178bb4007SShri Abhyankar 256278bb4007SShri Abhyankar PetscFunctionBegin; 256378bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 256478bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256578bb4007SShri Abhyankar t = a->solve_work; 256678bb4007SShri Abhyankar 256778bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 256878bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 256978bb4007SShri Abhyankar 257078bb4007SShri Abhyankar /* forward solve the lower triangular */ 257178bb4007SShri Abhyankar idx = 5*r[0]; 257278bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 257378bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 257478bb4007SShri Abhyankar for (i=1; i<n; i++) { 257578bb4007SShri Abhyankar v = aa + 25*ai[i]; 257678bb4007SShri Abhyankar vi = aj + ai[i]; 257778bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 257878bb4007SShri Abhyankar idx = 5*r[i]; 257978bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 258078bb4007SShri Abhyankar s5 = b[4+idx]; 258178bb4007SShri Abhyankar for(m=0;m<nz;m++){ 258278bb4007SShri Abhyankar idx = 5*vi[m]; 258378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 258478bb4007SShri Abhyankar x4 = t[3+idx];x5 = t[4+idx]; 258578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 258678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 258778bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 258878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 258978bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 259078bb4007SShri Abhyankar v += 25; 259178bb4007SShri Abhyankar } 259278bb4007SShri Abhyankar idx = 5*i; 259378bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 259478bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 259578bb4007SShri Abhyankar } 259678bb4007SShri Abhyankar /* backward solve the upper triangular */ 259778bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 259878bb4007SShri Abhyankar v = aa + 25*(adiag[i+1]+1); 259978bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 260078bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 260178bb4007SShri Abhyankar idt = 5*i; 260278bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 260378bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 260478bb4007SShri Abhyankar for(m=0;m<nz;m++){ 260578bb4007SShri Abhyankar idx = 5*vi[m]; 260678bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 260778bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 260878bb4007SShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 260978bb4007SShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 261078bb4007SShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 261178bb4007SShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 261278bb4007SShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 261378bb4007SShri Abhyankar v += 25; 261478bb4007SShri Abhyankar } 261578bb4007SShri Abhyankar idc = 5*c[i]; 261678bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 261778bb4007SShri Abhyankar v[15]*s4+v[20]*s5; 261878bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 261978bb4007SShri Abhyankar v[16]*s4+v[21]*s5; 262078bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 262178bb4007SShri Abhyankar v[17]*s4+v[22]*s5; 262278bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 262378bb4007SShri Abhyankar v[18]*s4+v[23]*s5; 262478bb4007SShri Abhyankar x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 262578bb4007SShri Abhyankar v[19]*s4+v[24]*s5; 262678bb4007SShri Abhyankar } 262778bb4007SShri Abhyankar 262878bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 262978bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 263078bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 263178bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 263278bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 263378bb4007SShri Abhyankar PetscFunctionReturn(0); 263478bb4007SShri Abhyankar } 263578bb4007SShri Abhyankar 26368f690400SShri Abhyankar #undef __FUNCT__ 26374a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2638dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 263915091d37SBarry Smith { 264015091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2641690b6cddSBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2642dfbe8321SBarry Smith PetscErrorCode ierr; 2643690b6cddSBarry Smith PetscInt *diag = a->diag,jdx; 2644d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2645d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2646d9fead3dSBarry Smith const PetscScalar *b; 264715091d37SBarry Smith 264815091d37SBarry Smith PetscFunctionBegin; 2649d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 26501ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 265115091d37SBarry Smith /* forward solve the lower triangular */ 265215091d37SBarry Smith idx = 0; 265315091d37SBarry Smith x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 265415091d37SBarry Smith for (i=1; i<n; i++) { 265515091d37SBarry Smith v = aa + 25*ai[i]; 265615091d37SBarry Smith vi = aj + ai[i]; 265715091d37SBarry Smith nz = diag[i] - ai[i]; 265815091d37SBarry Smith idx = 5*i; 2659f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 266015091d37SBarry Smith while (nz--) { 266115091d37SBarry Smith jdx = 5*(*vi++); 266215091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2663f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2664f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2665f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2666f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2667f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 266815091d37SBarry Smith v += 25; 266915091d37SBarry Smith } 2670f1af5d2fSBarry Smith x[idx] = s1; 2671f1af5d2fSBarry Smith x[1+idx] = s2; 2672f1af5d2fSBarry Smith x[2+idx] = s3; 2673f1af5d2fSBarry Smith x[3+idx] = s4; 2674f1af5d2fSBarry Smith x[4+idx] = s5; 267515091d37SBarry Smith } 267615091d37SBarry Smith /* backward solve the upper triangular */ 267715091d37SBarry Smith for (i=n-1; i>=0; i--){ 267815091d37SBarry Smith v = aa + 25*diag[i] + 25; 267915091d37SBarry Smith vi = aj + diag[i] + 1; 268015091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 268115091d37SBarry Smith idt = 5*i; 2682f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 2683f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 268415091d37SBarry Smith while (nz--) { 268515091d37SBarry Smith idx = 5*(*vi++); 268615091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2687f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2688f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2689f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2690f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2691f1af5d2fSBarry Smith s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 269215091d37SBarry Smith v += 25; 269315091d37SBarry Smith } 269415091d37SBarry Smith v = aa + 25*diag[i]; 2695f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2696f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2697f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2698f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2699f1af5d2fSBarry Smith x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 270015091d37SBarry Smith } 270115091d37SBarry Smith 2702d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27031ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2704dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 270515091d37SBarry Smith PetscFunctionReturn(0); 270615091d37SBarry Smith } 270715091d37SBarry Smith 2708cee9d6f2SShri Abhyankar #undef __FUNCT__ 2709a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2710a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 271153cca76cSShri Abhyankar { 271253cca76cSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 271353cca76cSShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 271453cca76cSShri Abhyankar PetscErrorCode ierr; 271553cca76cSShri Abhyankar PetscInt jdx; 271653cca76cSShri Abhyankar const MatScalar *aa=a->a,*v; 271753cca76cSShri Abhyankar PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 271853cca76cSShri Abhyankar const PetscScalar *b; 271953cca76cSShri Abhyankar 272053cca76cSShri Abhyankar PetscFunctionBegin; 272153cca76cSShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 272253cca76cSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 272353cca76cSShri Abhyankar /* forward solve the lower triangular */ 272453cca76cSShri Abhyankar idx = 0; 272553cca76cSShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 272653cca76cSShri Abhyankar for (i=1; i<n; i++) { 272753cca76cSShri Abhyankar v = aa + 25*ai[i]; 272853cca76cSShri Abhyankar vi = aj + ai[i]; 272953cca76cSShri Abhyankar nz = ai[i+1] - ai[i]; 273053cca76cSShri Abhyankar idx = 5*i; 273153cca76cSShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 273253cca76cSShri Abhyankar for(k=0;k<nz;k++) { 273353cca76cSShri Abhyankar jdx = 5*vi[k]; 273453cca76cSShri Abhyankar x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 273553cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 273653cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 273753cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 273853cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 273953cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 274053cca76cSShri Abhyankar v += 25; 274153cca76cSShri Abhyankar } 274253cca76cSShri Abhyankar x[idx] = s1; 274353cca76cSShri Abhyankar x[1+idx] = s2; 274453cca76cSShri Abhyankar x[2+idx] = s3; 274553cca76cSShri Abhyankar x[3+idx] = s4; 274653cca76cSShri Abhyankar x[4+idx] = s5; 274753cca76cSShri Abhyankar } 274853cca76cSShri Abhyankar 274953cca76cSShri Abhyankar /* backward solve the upper triangular */ 275053cca76cSShri Abhyankar for (i=n-1; i>=0; i--){ 275153cca76cSShri Abhyankar v = aa + 25*(adiag[i+1]+1); 275253cca76cSShri Abhyankar vi = aj + adiag[i+1]+1; 275353cca76cSShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 275453cca76cSShri Abhyankar idt = 5*i; 275553cca76cSShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 275653cca76cSShri Abhyankar s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 275753cca76cSShri Abhyankar for(k=0;k<nz;k++){ 275853cca76cSShri Abhyankar idx = 5*vi[k]; 275953cca76cSShri Abhyankar x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 276053cca76cSShri Abhyankar s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 276153cca76cSShri Abhyankar s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 276253cca76cSShri Abhyankar s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 276353cca76cSShri Abhyankar s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 276453cca76cSShri Abhyankar s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 276553cca76cSShri Abhyankar v += 25; 276653cca76cSShri Abhyankar } 276753cca76cSShri Abhyankar /* x = inv_diagonal*x */ 276853cca76cSShri Abhyankar x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 276953cca76cSShri Abhyankar x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 277053cca76cSShri Abhyankar x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 277153cca76cSShri Abhyankar x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 277253cca76cSShri Abhyankar x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 277353cca76cSShri Abhyankar } 277453cca76cSShri Abhyankar 277553cca76cSShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 277653cca76cSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 277753cca76cSShri Abhyankar ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 277853cca76cSShri Abhyankar PetscFunctionReturn(0); 277953cca76cSShri Abhyankar } 278053cca76cSShri Abhyankar 278153cca76cSShri Abhyankar #undef __FUNCT__ 27824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2783dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 27844e2b4712SSatish Balay { 27854e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 27864e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 27876849ba73SBarry Smith PetscErrorCode ierr; 27885d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 27895d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2790d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2791d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2792d9fead3dSBarry Smith const PetscScalar *b; 27934e2b4712SSatish Balay 27944e2b4712SSatish Balay PetscFunctionBegin; 2795d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27961ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2797f1af5d2fSBarry Smith t = a->solve_work; 27984e2b4712SSatish Balay 27994e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 28004e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 28014e2b4712SSatish Balay 28024e2b4712SSatish Balay /* forward solve the lower triangular */ 28034e2b4712SSatish Balay idx = 4*(*r++); 2804f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 2805f1af5d2fSBarry Smith t[2] = b[2+idx]; t[3] = b[3+idx]; 28064e2b4712SSatish Balay for (i=1; i<n; i++) { 28074e2b4712SSatish Balay v = aa + 16*ai[i]; 28084e2b4712SSatish Balay vi = aj + ai[i]; 28094e2b4712SSatish Balay nz = diag[i] - ai[i]; 28104e2b4712SSatish Balay idx = 4*(*r++); 2811f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 28124e2b4712SSatish Balay while (nz--) { 28134e2b4712SSatish Balay idx = 4*(*vi++); 2814f1af5d2fSBarry Smith x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2815f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2816f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2817f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2818f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28194e2b4712SSatish Balay v += 16; 28204e2b4712SSatish Balay } 28214e2b4712SSatish Balay idx = 4*i; 2822f1af5d2fSBarry Smith t[idx] = s1;t[1+idx] = s2; 2823f1af5d2fSBarry Smith t[2+idx] = s3;t[3+idx] = s4; 28244e2b4712SSatish Balay } 28254e2b4712SSatish Balay /* backward solve the upper triangular */ 28264e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 28274e2b4712SSatish Balay v = aa + 16*diag[i] + 16; 28284e2b4712SSatish Balay vi = aj + diag[i] + 1; 28294e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 28304e2b4712SSatish Balay idt = 4*i; 2831f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 2832f1af5d2fSBarry Smith s3 = t[2+idt];s4 = t[3+idt]; 28334e2b4712SSatish Balay while (nz--) { 28344e2b4712SSatish Balay idx = 4*(*vi++); 2835f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 2836f1af5d2fSBarry Smith x3 = t[2+idx]; x4 = t[3+idx]; 2837f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2838f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2839f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2840f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 28414e2b4712SSatish Balay v += 16; 28424e2b4712SSatish Balay } 28434e2b4712SSatish Balay idc = 4*(*c--); 28444e2b4712SSatish Balay v = aa + 16*diag[i]; 2845f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2846f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2847f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2848f1af5d2fSBarry Smith x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 28494e2b4712SSatish Balay } 28504e2b4712SSatish Balay 28514e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 28524e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2853d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 28541ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2855dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 28564e2b4712SSatish Balay PetscFunctionReturn(0); 28574e2b4712SSatish Balay } 2858f26ec98cSKris Buschelman 28598f690400SShri Abhyankar #undef __FUNCT__ 2860a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2861a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 286278bb4007SShri Abhyankar { 286378bb4007SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 286478bb4007SShri Abhyankar IS iscol=a->col,isrow=a->row; 286578bb4007SShri Abhyankar PetscErrorCode ierr; 286678bb4007SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 286778bb4007SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 286878bb4007SShri Abhyankar const MatScalar *aa=a->a,*v; 286978bb4007SShri Abhyankar PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 287078bb4007SShri Abhyankar const PetscScalar *b; 287178bb4007SShri Abhyankar 287278bb4007SShri Abhyankar PetscFunctionBegin; 287378bb4007SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 287478bb4007SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 287578bb4007SShri Abhyankar t = a->solve_work; 287678bb4007SShri Abhyankar 287778bb4007SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 287878bb4007SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 287978bb4007SShri Abhyankar 288078bb4007SShri Abhyankar /* forward solve the lower triangular */ 288178bb4007SShri Abhyankar idx = 4*r[0]; 288278bb4007SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 288378bb4007SShri Abhyankar t[2] = b[2+idx]; t[3] = b[3+idx]; 288478bb4007SShri Abhyankar for (i=1; i<n; i++) { 288578bb4007SShri Abhyankar v = aa + 16*ai[i]; 288678bb4007SShri Abhyankar vi = aj + ai[i]; 288778bb4007SShri Abhyankar nz = ai[i+1] - ai[i]; 288878bb4007SShri Abhyankar idx = 4*r[i]; 288978bb4007SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 289078bb4007SShri Abhyankar for(m=0;m<nz;m++){ 289178bb4007SShri Abhyankar idx = 4*vi[m]; 289278bb4007SShri Abhyankar x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 289378bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 289478bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 289578bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 289678bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 289778bb4007SShri Abhyankar v += 16; 289878bb4007SShri Abhyankar } 289978bb4007SShri Abhyankar idx = 4*i; 290078bb4007SShri Abhyankar t[idx] = s1;t[1+idx] = s2; 290178bb4007SShri Abhyankar t[2+idx] = s3;t[3+idx] = s4; 290278bb4007SShri Abhyankar } 290378bb4007SShri Abhyankar /* backward solve the upper triangular */ 290478bb4007SShri Abhyankar for (i=n-1; i>=0; i--){ 290578bb4007SShri Abhyankar v = aa + 16*(adiag[i+1]+1); 290678bb4007SShri Abhyankar vi = aj + adiag[i+1]+1; 290778bb4007SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 290878bb4007SShri Abhyankar idt = 4*i; 290978bb4007SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 291078bb4007SShri Abhyankar s3 = t[2+idt];s4 = t[3+idt]; 291178bb4007SShri Abhyankar for(m=0;m<nz;m++){ 291278bb4007SShri Abhyankar idx = 4*vi[m]; 291378bb4007SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 291478bb4007SShri Abhyankar x3 = t[2+idx]; x4 = t[3+idx]; 291578bb4007SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 291678bb4007SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 291778bb4007SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 291878bb4007SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 291978bb4007SShri Abhyankar v += 16; 292078bb4007SShri Abhyankar } 292178bb4007SShri Abhyankar idc = 4*c[i]; 292278bb4007SShri Abhyankar x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 292378bb4007SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 292478bb4007SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 292578bb4007SShri Abhyankar x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 292678bb4007SShri Abhyankar } 292778bb4007SShri Abhyankar 292878bb4007SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 292978bb4007SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 293078bb4007SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 293178bb4007SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 293278bb4007SShri Abhyankar ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 293378bb4007SShri Abhyankar PetscFunctionReturn(0); 293478bb4007SShri Abhyankar } 293578bb4007SShri Abhyankar 293678bb4007SShri Abhyankar #undef __FUNCT__ 2937f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2938dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2939f26ec98cSKris Buschelman { 2940f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2941f26ec98cSKris Buschelman IS iscol=a->col,isrow=a->row; 29426849ba73SBarry Smith PetscErrorCode ierr; 29435d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 29445d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2945d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 2946d9fead3dSBarry Smith MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2947d9fead3dSBarry Smith PetscScalar *x; 2948d9fead3dSBarry Smith const PetscScalar *b; 2949f26ec98cSKris Buschelman 2950f26ec98cSKris Buschelman PetscFunctionBegin; 2951d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 29521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2953f26ec98cSKris Buschelman t = (MatScalar *)a->solve_work; 2954f26ec98cSKris Buschelman 2955f26ec98cSKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2956f26ec98cSKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2957f26ec98cSKris Buschelman 2958f26ec98cSKris Buschelman /* forward solve the lower triangular */ 2959f26ec98cSKris Buschelman idx = 4*(*r++); 2960f26ec98cSKris Buschelman t[0] = (MatScalar)b[idx]; 2961f26ec98cSKris Buschelman t[1] = (MatScalar)b[1+idx]; 2962f26ec98cSKris Buschelman t[2] = (MatScalar)b[2+idx]; 2963f26ec98cSKris Buschelman t[3] = (MatScalar)b[3+idx]; 2964f26ec98cSKris Buschelman for (i=1; i<n; i++) { 2965f26ec98cSKris Buschelman v = aa + 16*ai[i]; 2966f26ec98cSKris Buschelman vi = aj + ai[i]; 2967f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 2968f26ec98cSKris Buschelman idx = 4*(*r++); 2969f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 2970f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 2971f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 2972f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 2973f26ec98cSKris Buschelman while (nz--) { 2974f26ec98cSKris Buschelman idx = 4*(*vi++); 2975f26ec98cSKris Buschelman x1 = t[idx]; 2976f26ec98cSKris Buschelman x2 = t[1+idx]; 2977f26ec98cSKris Buschelman x3 = t[2+idx]; 2978f26ec98cSKris Buschelman x4 = t[3+idx]; 2979f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2980f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2981f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2982f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2983f26ec98cSKris Buschelman v += 16; 2984f26ec98cSKris Buschelman } 2985f26ec98cSKris Buschelman idx = 4*i; 2986f26ec98cSKris Buschelman t[idx] = s1; 2987f26ec98cSKris Buschelman t[1+idx] = s2; 2988f26ec98cSKris Buschelman t[2+idx] = s3; 2989f26ec98cSKris Buschelman t[3+idx] = s4; 2990f26ec98cSKris Buschelman } 2991f26ec98cSKris Buschelman /* backward solve the upper triangular */ 2992f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 2993f26ec98cSKris Buschelman v = aa + 16*diag[i] + 16; 2994f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 2995f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 2996f26ec98cSKris Buschelman idt = 4*i; 2997f26ec98cSKris Buschelman s1 = t[idt]; 2998f26ec98cSKris Buschelman s2 = t[1+idt]; 2999f26ec98cSKris Buschelman s3 = t[2+idt]; 3000f26ec98cSKris Buschelman s4 = t[3+idt]; 3001f26ec98cSKris Buschelman while (nz--) { 3002f26ec98cSKris Buschelman idx = 4*(*vi++); 3003f26ec98cSKris Buschelman x1 = t[idx]; 3004f26ec98cSKris Buschelman x2 = t[1+idx]; 3005f26ec98cSKris Buschelman x3 = t[2+idx]; 3006f26ec98cSKris Buschelman x4 = t[3+idx]; 3007f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3008f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3009f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3010f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3011f26ec98cSKris Buschelman v += 16; 3012f26ec98cSKris Buschelman } 3013f26ec98cSKris Buschelman idc = 4*(*c--); 3014f26ec98cSKris Buschelman v = aa + 16*diag[i]; 3015f26ec98cSKris Buschelman t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3016f26ec98cSKris Buschelman t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3017f26ec98cSKris Buschelman t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3018f26ec98cSKris Buschelman t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3019f26ec98cSKris Buschelman x[idc] = (PetscScalar)t[idt]; 3020f26ec98cSKris Buschelman x[1+idc] = (PetscScalar)t[1+idt]; 3021f26ec98cSKris Buschelman x[2+idc] = (PetscScalar)t[2+idt]; 3022f26ec98cSKris Buschelman x[3+idc] = (PetscScalar)t[3+idt]; 3023f26ec98cSKris Buschelman } 3024f26ec98cSKris Buschelman 3025f26ec98cSKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3026f26ec98cSKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3027d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 30281ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3029dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3030f26ec98cSKris Buschelman PetscFunctionReturn(0); 3031f26ec98cSKris Buschelman } 3032f26ec98cSKris Buschelman 303324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE) 303424c233c2SKris Buschelman 303524c233c2SKris Buschelman #include PETSC_HAVE_SSE 303624c233c2SKris Buschelman 303724c233c2SKris Buschelman #undef __FUNCT__ 303824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3039dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 304024c233c2SKris Buschelman { 304124c233c2SKris Buschelman /* 304224c233c2SKris Buschelman Note: This code uses demotion of double 304324c233c2SKris Buschelman to float when performing the mixed-mode computation. 304424c233c2SKris Buschelman This may not be numerically reasonable for all applications. 304524c233c2SKris Buschelman */ 304624c233c2SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 304724c233c2SKris Buschelman IS iscol=a->col,isrow=a->row; 30486849ba73SBarry Smith PetscErrorCode ierr; 30495d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 30505d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 305124c233c2SKris Buschelman MatScalar *aa=a->a,*v; 305287828ca2SBarry Smith PetscScalar *x,*b,*t; 305324c233c2SKris Buschelman 305424c233c2SKris Buschelman /* Make space in temp stack for 16 Byte Aligned arrays */ 305524c233c2SKris Buschelman float ssealignedspace[11],*tmps,*tmpx; 305624c233c2SKris Buschelman unsigned long offset; 305724c233c2SKris Buschelman 305824c233c2SKris Buschelman PetscFunctionBegin; 305924c233c2SKris Buschelman SSE_SCOPE_BEGIN; 306024c233c2SKris Buschelman 306124c233c2SKris Buschelman offset = (unsigned long)ssealignedspace % 16; 306224c233c2SKris Buschelman if (offset) offset = (16 - offset)/4; 306324c233c2SKris Buschelman tmps = &ssealignedspace[offset]; 306424c233c2SKris Buschelman tmpx = &ssealignedspace[offset+4]; 306524c233c2SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 306624c233c2SKris Buschelman 30671ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 30681ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 306924c233c2SKris Buschelman t = a->solve_work; 307024c233c2SKris Buschelman 307124c233c2SKris Buschelman ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 307224c233c2SKris Buschelman ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 307324c233c2SKris Buschelman 307424c233c2SKris Buschelman /* forward solve the lower triangular */ 307524c233c2SKris Buschelman idx = 4*(*r++); 307624c233c2SKris Buschelman t[0] = b[idx]; t[1] = b[1+idx]; 307724c233c2SKris Buschelman t[2] = b[2+idx]; t[3] = b[3+idx]; 307824c233c2SKris Buschelman v = aa + 16*ai[1]; 307924c233c2SKris Buschelman 308024c233c2SKris Buschelman for (i=1; i<n;) { 308124c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 308224c233c2SKris Buschelman vi = aj + ai[i]; 308324c233c2SKris Buschelman nz = diag[i] - ai[i]; 308424c233c2SKris Buschelman idx = 4*(*r++); 308524c233c2SKris Buschelman 308624c233c2SKris Buschelman /* Demote sum from double to float */ 308724c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 308824c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 308924c233c2SKris Buschelman 309024c233c2SKris Buschelman while (nz--) { 309124c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 309224c233c2SKris Buschelman idx = 4*(*vi++); 309324c233c2SKris Buschelman 309424c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 309524c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 309624c233c2SKris Buschelman 309724c233c2SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 309824c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 309924c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 310024c233c2SKris Buschelman 310124c233c2SKris Buschelman /* First Column */ 310224c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 310324c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 310424c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 310524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 310624c233c2SKris Buschelman 310724c233c2SKris Buschelman /* Second Column */ 310824c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 310924c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 311024c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 311124c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 311224c233c2SKris Buschelman 311324c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 311424c233c2SKris Buschelman 311524c233c2SKris Buschelman /* Third Column */ 311624c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 311724c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 311824c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 311924c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 312024c233c2SKris Buschelman 312124c233c2SKris Buschelman /* Fourth Column */ 312224c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 312324c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 312424c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 312524c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 312624c233c2SKris Buschelman SSE_INLINE_END_2 312724c233c2SKris Buschelman 312824c233c2SKris Buschelman v += 16; 312924c233c2SKris Buschelman } 313024c233c2SKris Buschelman idx = 4*i; 313124c233c2SKris Buschelman v = aa + 16*ai[++i]; 313224c233c2SKris Buschelman PREFETCH_NTA(v); 313324c233c2SKris Buschelman STORE_PS(tmps,XMM7); 313424c233c2SKris Buschelman 313524c233c2SKris Buschelman /* Promote result from float to double */ 313624c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 313724c233c2SKris Buschelman } 313824c233c2SKris Buschelman /* backward solve the upper triangular */ 313924c233c2SKris Buschelman idt = 4*(n-1); 314024c233c2SKris Buschelman ai16 = 16*diag[n-1]; 314124c233c2SKris Buschelman v = aa + ai16 + 16; 314224c233c2SKris Buschelman for (i=n-1; i>=0;){ 314324c233c2SKris Buschelman PREFETCH_NTA(&v[8]); 314424c233c2SKris Buschelman vi = aj + diag[i] + 1; 314524c233c2SKris Buschelman nz = ai[i+1] - diag[i] - 1; 314624c233c2SKris Buschelman 314724c233c2SKris Buschelman /* Demote accumulator from double to float */ 314824c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 314924c233c2SKris Buschelman LOAD_PS(tmps,XMM7); 315024c233c2SKris Buschelman 315124c233c2SKris Buschelman while (nz--) { 315224c233c2SKris Buschelman PREFETCH_NTA(&v[16]); 315324c233c2SKris Buschelman idx = 4*(*vi++); 315424c233c2SKris Buschelman 315524c233c2SKris Buschelman /* Demote solution (so far) from double to float */ 315624c233c2SKris Buschelman CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 315724c233c2SKris Buschelman 315824c233c2SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 315924c233c2SKris Buschelman SSE_INLINE_BEGIN_2(tmpx,v) 316024c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 316124c233c2SKris Buschelman 316224c233c2SKris Buschelman /* First Column */ 316324c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 316424c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 316524c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 316624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 316724c233c2SKris Buschelman 316824c233c2SKris Buschelman /* Second Column */ 316924c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 317024c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 317124c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 317224c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 317324c233c2SKris Buschelman 317424c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 317524c233c2SKris Buschelman 317624c233c2SKris Buschelman /* Third Column */ 317724c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 317824c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 317924c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 318024c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 318124c233c2SKris Buschelman 318224c233c2SKris Buschelman /* Fourth Column */ 318324c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 318424c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 318524c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 318624c233c2SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 318724c233c2SKris Buschelman SSE_INLINE_END_2 318824c233c2SKris Buschelman v += 16; 318924c233c2SKris Buschelman } 319024c233c2SKris Buschelman v = aa + ai16; 319124c233c2SKris Buschelman ai16 = 16*diag[--i]; 319224c233c2SKris Buschelman PREFETCH_NTA(aa+ai16+16); 319324c233c2SKris Buschelman /* 319424c233c2SKris Buschelman Scale the result by the diagonal 4x4 block, 319524c233c2SKris Buschelman which was inverted as part of the factorization 319624c233c2SKris Buschelman */ 319724c233c2SKris Buschelman SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 319824c233c2SKris Buschelman /* First Column */ 319924c233c2SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 320024c233c2SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 320124c233c2SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 320224c233c2SKris Buschelman 320324c233c2SKris Buschelman /* Second Column */ 320424c233c2SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 320524c233c2SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 320624c233c2SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 320724c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 320824c233c2SKris Buschelman 320924c233c2SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 321024c233c2SKris Buschelman 321124c233c2SKris Buschelman /* Third Column */ 321224c233c2SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 321324c233c2SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 321424c233c2SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 321524c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 321624c233c2SKris Buschelman 321724c233c2SKris Buschelman /* Fourth Column */ 321824c233c2SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 321924c233c2SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 322024c233c2SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 322124c233c2SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 322224c233c2SKris Buschelman 322324c233c2SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 322424c233c2SKris Buschelman SSE_INLINE_END_3 322524c233c2SKris Buschelman 322624c233c2SKris Buschelman /* Promote solution from float to double */ 322724c233c2SKris Buschelman CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 322824c233c2SKris Buschelman 322924c233c2SKris Buschelman /* Apply reordering to t and stream into x. */ 323024c233c2SKris Buschelman /* This way, x doesn't pollute the cache. */ 323124c233c2SKris Buschelman /* Be careful with size: 2 doubles = 4 floats! */ 323224c233c2SKris Buschelman idc = 4*(*c--); 323324c233c2SKris Buschelman SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 323424c233c2SKris Buschelman /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 323524c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 323624c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 323724c233c2SKris Buschelman /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 323824c233c2SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 323924c233c2SKris Buschelman SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 324024c233c2SKris Buschelman SSE_INLINE_END_2 324124c233c2SKris Buschelman v = aa + ai16 + 16; 324224c233c2SKris Buschelman idt -= 4; 324324c233c2SKris Buschelman } 324424c233c2SKris Buschelman 324524c233c2SKris Buschelman ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 324624c233c2SKris Buschelman ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 32471ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 32481ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3249dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 325024c233c2SKris Buschelman SSE_SCOPE_END; 325124c233c2SKris Buschelman PetscFunctionReturn(0); 325224c233c2SKris Buschelman } 325324c233c2SKris Buschelman 325424c233c2SKris Buschelman #endif 32550ef38995SBarry Smith 32560ef38995SBarry Smith 32574e2b4712SSatish Balay /* 32584e2b4712SSatish Balay Special case where the matrix was ILU(0) factored in the natural 32594e2b4712SSatish Balay ordering. This eliminates the need for the column and row permutation. 32604e2b4712SSatish Balay */ 32614a2ae208SSatish Balay #undef __FUNCT__ 32624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3263dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 32644e2b4712SSatish Balay { 32654e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3266356650c2SBarry Smith PetscInt n=a->mbs; 3267356650c2SBarry Smith const PetscInt *ai=a->i,*aj=a->j; 3268dfbe8321SBarry Smith PetscErrorCode ierr; 3269356650c2SBarry Smith const PetscInt *diag = a->diag; 3270d9fead3dSBarry Smith const MatScalar *aa=a->a; 3271d9fead3dSBarry Smith PetscScalar *x; 3272d9fead3dSBarry Smith const PetscScalar *b; 32734e2b4712SSatish Balay 32744e2b4712SSatish Balay PetscFunctionBegin; 3275d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 32761ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 32774e2b4712SSatish Balay 3278aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 32792853dc0eSBarry Smith { 328087828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 32812853dc0eSBarry Smith fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 32822853dc0eSBarry Smith } 3283aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 32842853dc0eSBarry Smith { 328587828ca2SBarry Smith static PetscScalar w[2000]; /* very BAD need to fix */ 32862853dc0eSBarry Smith fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 32872853dc0eSBarry Smith } 3288aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 32892853dc0eSBarry Smith fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3290e1293385SBarry Smith #else 329130d4dcafSBarry Smith { 329287828ca2SBarry Smith PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3293d9fead3dSBarry Smith const MatScalar *v; 3294356650c2SBarry Smith PetscInt jdx,idt,idx,nz,i,ai16; 3295356650c2SBarry Smith const PetscInt *vi; 3296e1293385SBarry Smith 32974e2b4712SSatish Balay /* forward solve the lower triangular */ 32984e2b4712SSatish Balay idx = 0; 3299e1293385SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 33004e2b4712SSatish Balay for (i=1; i<n; i++) { 33014e2b4712SSatish Balay v = aa + 16*ai[i]; 33024e2b4712SSatish Balay vi = aj + ai[i]; 33034e2b4712SSatish Balay nz = diag[i] - ai[i]; 3304e1293385SBarry Smith idx += 4; 3305f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 33064e2b4712SSatish Balay while (nz--) { 33074e2b4712SSatish Balay jdx = 4*(*vi++); 33084e2b4712SSatish Balay x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3309f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3310f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3311f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3312f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33134e2b4712SSatish Balay v += 16; 33144e2b4712SSatish Balay } 3315f1af5d2fSBarry Smith x[idx] = s1; 3316f1af5d2fSBarry Smith x[1+idx] = s2; 3317f1af5d2fSBarry Smith x[2+idx] = s3; 3318f1af5d2fSBarry Smith x[3+idx] = s4; 33194e2b4712SSatish Balay } 33204e2b4712SSatish Balay /* backward solve the upper triangular */ 33214e555682SBarry Smith idt = 4*(n-1); 33224e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 33234e555682SBarry Smith ai16 = 16*diag[i]; 33244e555682SBarry Smith v = aa + ai16 + 16; 33254e2b4712SSatish Balay vi = aj + diag[i] + 1; 33264e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 3327f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 3328f1af5d2fSBarry Smith s3 = x[2+idt];s4 = x[3+idt]; 33294e2b4712SSatish Balay while (nz--) { 33304e2b4712SSatish Balay idx = 4*(*vi++); 33314e2b4712SSatish Balay x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3332f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3333f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3334f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3335f1af5d2fSBarry Smith s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 33364e2b4712SSatish Balay v += 16; 33374e2b4712SSatish Balay } 33384e555682SBarry Smith v = aa + ai16; 3339f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3340f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3341f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3342f1af5d2fSBarry Smith x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3343329f5518SBarry Smith idt -= 4; 33444e2b4712SSatish Balay } 334530d4dcafSBarry Smith } 3346e1293385SBarry Smith #endif 33474e2b4712SSatish Balay 3348d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 33491ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3350dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 33514e2b4712SSatish Balay PetscFunctionReturn(0); 33524e2b4712SSatish Balay } 33534e2b4712SSatish Balay 3354b2b2dd24SShri Abhyankar #undef __FUNCT__ 3355a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3356a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3357b2b2dd24SShri Abhyankar { 3358b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3359b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3360b2b2dd24SShri Abhyankar PetscErrorCode ierr; 3361b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 3362b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3363b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 3364b2b2dd24SShri Abhyankar PetscScalar *x; 3365b2b2dd24SShri Abhyankar const PetscScalar *b; 3366b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3367cee9d6f2SShri Abhyankar 3368b2b2dd24SShri Abhyankar PetscFunctionBegin; 3369b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3370b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3371b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 3372b2b2dd24SShri Abhyankar idx = 0; 3373b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3374b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 3375b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 3376b2b2dd24SShri Abhyankar vi = aj + ai[i]; 3377b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 3378b2b2dd24SShri Abhyankar idx = bs*i; 3379b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3380b2b2dd24SShri Abhyankar for(k=0;k<nz;k++) { 3381b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 3382b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3383b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3384b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3385b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3386b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3387b2b2dd24SShri Abhyankar 3388b2b2dd24SShri Abhyankar v += bs2; 3389b2b2dd24SShri Abhyankar } 3390b2b2dd24SShri Abhyankar 3391b2b2dd24SShri Abhyankar x[idx] = s1; 3392b2b2dd24SShri Abhyankar x[1+idx] = s2; 3393b2b2dd24SShri Abhyankar x[2+idx] = s3; 3394b2b2dd24SShri Abhyankar x[3+idx] = s4; 3395b2b2dd24SShri Abhyankar } 3396b2b2dd24SShri Abhyankar 3397b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 3398b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 3399b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 3400b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 3401b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 3402b2b2dd24SShri Abhyankar idt = bs*i; 3403b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3404b2b2dd24SShri Abhyankar 3405b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 3406b2b2dd24SShri Abhyankar idx = bs*vi[k]; 3407b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3408b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3409b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3410b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3411b2b2dd24SShri Abhyankar s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3412b2b2dd24SShri Abhyankar 3413b2b2dd24SShri Abhyankar v += bs2; 3414b2b2dd24SShri Abhyankar } 3415b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 3416b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3417b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3418b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3419b2b2dd24SShri Abhyankar x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3420b2b2dd24SShri Abhyankar 3421b2b2dd24SShri Abhyankar } 3422b2b2dd24SShri Abhyankar 3423b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3424b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3425b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3426b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 3427b2b2dd24SShri Abhyankar } 3428cee9d6f2SShri Abhyankar 3429cee9d6f2SShri Abhyankar #undef __FUNCT__ 3430f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3431dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3432f26ec98cSKris Buschelman { 3433f26ec98cSKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3434690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3435dfbe8321SBarry Smith PetscErrorCode ierr; 3436690b6cddSBarry Smith PetscInt *diag = a->diag; 3437f26ec98cSKris Buschelman MatScalar *aa=a->a; 3438f26ec98cSKris Buschelman PetscScalar *x,*b; 3439f26ec98cSKris Buschelman 3440f26ec98cSKris Buschelman PetscFunctionBegin; 34411ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 34421ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3443f26ec98cSKris Buschelman 3444f26ec98cSKris Buschelman { 3445f26ec98cSKris Buschelman MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3446f26ec98cSKris Buschelman MatScalar *v,*t=(MatScalar *)x; 3447690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3448f26ec98cSKris Buschelman 3449f26ec98cSKris Buschelman /* forward solve the lower triangular */ 3450f26ec98cSKris Buschelman idx = 0; 3451f26ec98cSKris Buschelman t[0] = (MatScalar)b[0]; 3452f26ec98cSKris Buschelman t[1] = (MatScalar)b[1]; 3453f26ec98cSKris Buschelman t[2] = (MatScalar)b[2]; 3454f26ec98cSKris Buschelman t[3] = (MatScalar)b[3]; 3455f26ec98cSKris Buschelman for (i=1; i<n; i++) { 3456f26ec98cSKris Buschelman v = aa + 16*ai[i]; 3457f26ec98cSKris Buschelman vi = aj + ai[i]; 3458f26ec98cSKris Buschelman nz = diag[i] - ai[i]; 3459f26ec98cSKris Buschelman idx += 4; 3460f26ec98cSKris Buschelman s1 = (MatScalar)b[idx]; 3461f26ec98cSKris Buschelman s2 = (MatScalar)b[1+idx]; 3462f26ec98cSKris Buschelman s3 = (MatScalar)b[2+idx]; 3463f26ec98cSKris Buschelman s4 = (MatScalar)b[3+idx]; 3464f26ec98cSKris Buschelman while (nz--) { 3465f26ec98cSKris Buschelman jdx = 4*(*vi++); 3466f26ec98cSKris Buschelman x1 = t[jdx]; 3467f26ec98cSKris Buschelman x2 = t[1+jdx]; 3468f26ec98cSKris Buschelman x3 = t[2+jdx]; 3469f26ec98cSKris Buschelman x4 = t[3+jdx]; 3470f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3471f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3472f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3473f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3474f26ec98cSKris Buschelman v += 16; 3475f26ec98cSKris Buschelman } 3476f26ec98cSKris Buschelman t[idx] = s1; 3477f26ec98cSKris Buschelman t[1+idx] = s2; 3478f26ec98cSKris Buschelman t[2+idx] = s3; 3479f26ec98cSKris Buschelman t[3+idx] = s4; 3480f26ec98cSKris Buschelman } 3481f26ec98cSKris Buschelman /* backward solve the upper triangular */ 3482f26ec98cSKris Buschelman idt = 4*(n-1); 3483f26ec98cSKris Buschelman for (i=n-1; i>=0; i--){ 3484f26ec98cSKris Buschelman ai16 = 16*diag[i]; 3485f26ec98cSKris Buschelman v = aa + ai16 + 16; 3486f26ec98cSKris Buschelman vi = aj + diag[i] + 1; 3487f26ec98cSKris Buschelman nz = ai[i+1] - diag[i] - 1; 3488f26ec98cSKris Buschelman s1 = t[idt]; 3489f26ec98cSKris Buschelman s2 = t[1+idt]; 3490f26ec98cSKris Buschelman s3 = t[2+idt]; 3491f26ec98cSKris Buschelman s4 = t[3+idt]; 3492f26ec98cSKris Buschelman while (nz--) { 3493f26ec98cSKris Buschelman idx = 4*(*vi++); 3494f26ec98cSKris Buschelman x1 = (MatScalar)x[idx]; 3495f26ec98cSKris Buschelman x2 = (MatScalar)x[1+idx]; 3496f26ec98cSKris Buschelman x3 = (MatScalar)x[2+idx]; 3497f26ec98cSKris Buschelman x4 = (MatScalar)x[3+idx]; 3498f26ec98cSKris Buschelman s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3499f26ec98cSKris Buschelman s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3500f26ec98cSKris Buschelman s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3501f26ec98cSKris Buschelman s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3502f26ec98cSKris Buschelman v += 16; 3503f26ec98cSKris Buschelman } 3504f26ec98cSKris Buschelman v = aa + ai16; 3505f26ec98cSKris Buschelman x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3506f26ec98cSKris Buschelman x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3507f26ec98cSKris Buschelman x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3508f26ec98cSKris Buschelman x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3509f26ec98cSKris Buschelman idt -= 4; 3510f26ec98cSKris Buschelman } 3511f26ec98cSKris Buschelman } 3512f26ec98cSKris Buschelman 35131ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 35141ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3515dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3516f26ec98cSKris Buschelman PetscFunctionReturn(0); 3517f26ec98cSKris Buschelman } 3518f26ec98cSKris Buschelman 35193660e330SKris Buschelman #if defined (PETSC_HAVE_SSE) 35203660e330SKris Buschelman 35213660e330SKris Buschelman #include PETSC_HAVE_SSE 35223660e330SKris Buschelman #undef __FUNCT__ 35237cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3524dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 35253660e330SKris Buschelman { 35263660e330SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 35272aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)a->j; 3528dfbe8321SBarry Smith PetscErrorCode ierr; 3529dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 35303660e330SKris Buschelman MatScalar *aa=a->a; 353187828ca2SBarry Smith PetscScalar *x,*b; 35323660e330SKris Buschelman 35333660e330SKris Buschelman PetscFunctionBegin; 35343660e330SKris Buschelman SSE_SCOPE_BEGIN; 35353660e330SKris Buschelman /* 35363660e330SKris Buschelman Note: This code currently uses demotion of double 35373660e330SKris Buschelman to float when performing the mixed-mode computation. 35383660e330SKris Buschelman This may not be numerically reasonable for all applications. 35393660e330SKris Buschelman */ 35403660e330SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 35413660e330SKris Buschelman 35421ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 35431ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 35443660e330SKris Buschelman { 3545eb05f457SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 3546eb05f457SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 35472aa5897fSKris Buschelman int nz,i,idt,ai16; 35482aa5897fSKris Buschelman unsigned int jdx,idx; 35492aa5897fSKris Buschelman unsigned short *vi; 3550eb05f457SKris Buschelman /* Forward solve the lower triangular factor. */ 35513660e330SKris Buschelman 3552eb05f457SKris Buschelman /* First block is the identity. */ 35533660e330SKris Buschelman idx = 0; 3554eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 35552aa5897fSKris Buschelman v = aa + 16*((unsigned int)ai[1]); 35563660e330SKris Buschelman 35573660e330SKris Buschelman for (i=1; i<n;) { 35583660e330SKris Buschelman PREFETCH_NTA(&v[8]); 35593660e330SKris Buschelman vi = aj + ai[i]; 35603660e330SKris Buschelman nz = diag[i] - ai[i]; 35613660e330SKris Buschelman idx += 4; 35623660e330SKris Buschelman 3563eb05f457SKris Buschelman /* Demote RHS from double to float. */ 3564eb05f457SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3565eb05f457SKris Buschelman LOAD_PS(&t[idx],XMM7); 35663660e330SKris Buschelman 35673660e330SKris Buschelman while (nz--) { 35683660e330SKris Buschelman PREFETCH_NTA(&v[16]); 35692aa5897fSKris Buschelman jdx = 4*((unsigned int)(*vi++)); 35703660e330SKris Buschelman 35713660e330SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 3572eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 35733660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 35743660e330SKris Buschelman 35753660e330SKris Buschelman /* First Column */ 35763660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 35773660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 35783660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 35793660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 35803660e330SKris Buschelman 35813660e330SKris Buschelman /* Second Column */ 35823660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 35833660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 35843660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 35853660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 35863660e330SKris Buschelman 35873660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 35883660e330SKris Buschelman 35893660e330SKris Buschelman /* Third Column */ 35903660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 35913660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 35923660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 35933660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 35943660e330SKris Buschelman 35953660e330SKris Buschelman /* Fourth Column */ 35963660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 35973660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 35983660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 35993660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 36003660e330SKris Buschelman SSE_INLINE_END_2 36013660e330SKris Buschelman 36023660e330SKris Buschelman v += 16; 36033660e330SKris Buschelman } 36043660e330SKris Buschelman v = aa + 16*ai[++i]; 36053660e330SKris Buschelman PREFETCH_NTA(v); 3606eb05f457SKris Buschelman STORE_PS(&t[idx],XMM7); 36073660e330SKris Buschelman } 3608eb05f457SKris Buschelman 3609eb05f457SKris Buschelman /* Backward solve the upper triangular factor.*/ 3610eb05f457SKris Buschelman 36113660e330SKris Buschelman idt = 4*(n-1); 36123660e330SKris Buschelman ai16 = 16*diag[n-1]; 36133660e330SKris Buschelman v = aa + ai16 + 16; 36143660e330SKris Buschelman for (i=n-1; i>=0;){ 36153660e330SKris Buschelman PREFETCH_NTA(&v[8]); 36163660e330SKris Buschelman vi = aj + diag[i] + 1; 36173660e330SKris Buschelman nz = ai[i+1] - diag[i] - 1; 36183660e330SKris Buschelman 3619eb05f457SKris Buschelman LOAD_PS(&t[idt],XMM7); 36203660e330SKris Buschelman 36213660e330SKris Buschelman while (nz--) { 36223660e330SKris Buschelman PREFETCH_NTA(&v[16]); 36232aa5897fSKris Buschelman idx = 4*((unsigned int)(*vi++)); 36243660e330SKris Buschelman 36253660e330SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 3626eb05f457SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 36273660e330SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 36283660e330SKris Buschelman 36293660e330SKris Buschelman /* First Column */ 36303660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 36313660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36323660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 36333660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 36343660e330SKris Buschelman 36353660e330SKris Buschelman /* Second Column */ 36363660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 36373660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36383660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 36393660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 36403660e330SKris Buschelman 36413660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 36423660e330SKris Buschelman 36433660e330SKris Buschelman /* Third Column */ 36443660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 36453660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36463660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 36473660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 36483660e330SKris Buschelman 36493660e330SKris Buschelman /* Fourth Column */ 36503660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 36513660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36523660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 36533660e330SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 36543660e330SKris Buschelman SSE_INLINE_END_2 36553660e330SKris Buschelman v += 16; 36563660e330SKris Buschelman } 36573660e330SKris Buschelman v = aa + ai16; 36583660e330SKris Buschelman ai16 = 16*diag[--i]; 36593660e330SKris Buschelman PREFETCH_NTA(aa+ai16+16); 36603660e330SKris Buschelman /* 36613660e330SKris Buschelman Scale the result by the diagonal 4x4 block, 36623660e330SKris Buschelman which was inverted as part of the factorization 36633660e330SKris Buschelman */ 3664eb05f457SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 36653660e330SKris Buschelman /* First Column */ 36663660e330SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 36673660e330SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 36683660e330SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 36693660e330SKris Buschelman 36703660e330SKris Buschelman /* Second Column */ 36713660e330SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 36723660e330SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 36733660e330SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 36743660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 36753660e330SKris Buschelman 36763660e330SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 36773660e330SKris Buschelman 36783660e330SKris Buschelman /* Third Column */ 36793660e330SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 36803660e330SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 36813660e330SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 36823660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 36833660e330SKris Buschelman 36843660e330SKris Buschelman /* Fourth Column */ 36853660e330SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 36863660e330SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 36873660e330SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 36883660e330SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 36893660e330SKris Buschelman 36903660e330SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 36913660e330SKris Buschelman SSE_INLINE_END_3 36923660e330SKris Buschelman 36933660e330SKris Buschelman v = aa + ai16 + 16; 36943660e330SKris Buschelman idt -= 4; 36953660e330SKris Buschelman } 3696eb05f457SKris Buschelman 3697eb05f457SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 3698eb05f457SKris Buschelman idt = 4*(n-1); 3699eb05f457SKris Buschelman for (i=n-1;i>=0;i--) { 3700eb05f457SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3701eb05f457SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3702eb05f457SKris Buschelman PetscScalar *xtemp=&x[idt]; 3703eb05f457SKris Buschelman MatScalar *ttemp=&t[idt]; 3704eb05f457SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 3705eb05f457SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 3706eb05f457SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 3707eb05f457SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 370854693613SKris Buschelman idt -= 4; 37093660e330SKris Buschelman } 3710eb05f457SKris Buschelman 3711eb05f457SKris Buschelman } /* End of artificial scope. */ 37121ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 37131ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3714dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 37153660e330SKris Buschelman SSE_SCOPE_END; 37163660e330SKris Buschelman PetscFunctionReturn(0); 37173660e330SKris Buschelman } 37183660e330SKris Buschelman 37197cf1b8d3SKris Buschelman #undef __FUNCT__ 37207cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3721dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 37227cf1b8d3SKris Buschelman { 37237cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 37247cf1b8d3SKris Buschelman int *aj=a->j; 3725dfbe8321SBarry Smith PetscErrorCode ierr; 3726dfbe8321SBarry Smith int *ai=a->i,n=a->mbs,*diag = a->diag; 37277cf1b8d3SKris Buschelman MatScalar *aa=a->a; 37287cf1b8d3SKris Buschelman PetscScalar *x,*b; 37297cf1b8d3SKris Buschelman 37307cf1b8d3SKris Buschelman PetscFunctionBegin; 37317cf1b8d3SKris Buschelman SSE_SCOPE_BEGIN; 37327cf1b8d3SKris Buschelman /* 37337cf1b8d3SKris Buschelman Note: This code currently uses demotion of double 37347cf1b8d3SKris Buschelman to float when performing the mixed-mode computation. 37357cf1b8d3SKris Buschelman This may not be numerically reasonable for all applications. 37367cf1b8d3SKris Buschelman */ 37377cf1b8d3SKris Buschelman PREFETCH_NTA(aa+16*ai[1]); 37387cf1b8d3SKris Buschelman 37391ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 37401ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 37417cf1b8d3SKris Buschelman { 37427cf1b8d3SKris Buschelman /* x will first be computed in single precision then promoted inplace to double */ 37437cf1b8d3SKris Buschelman MatScalar *v,*t=(MatScalar *)x; 37447cf1b8d3SKris Buschelman int nz,i,idt,ai16; 37457cf1b8d3SKris Buschelman int jdx,idx; 37467cf1b8d3SKris Buschelman int *vi; 37477cf1b8d3SKris Buschelman /* Forward solve the lower triangular factor. */ 37487cf1b8d3SKris Buschelman 37497cf1b8d3SKris Buschelman /* First block is the identity. */ 37507cf1b8d3SKris Buschelman idx = 0; 37517cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(t,b); 37527cf1b8d3SKris Buschelman v = aa + 16*ai[1]; 37537cf1b8d3SKris Buschelman 37547cf1b8d3SKris Buschelman for (i=1; i<n;) { 37557cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 37567cf1b8d3SKris Buschelman vi = aj + ai[i]; 37577cf1b8d3SKris Buschelman nz = diag[i] - ai[i]; 37587cf1b8d3SKris Buschelman idx += 4; 37597cf1b8d3SKris Buschelman 37607cf1b8d3SKris Buschelman /* Demote RHS from double to float. */ 37617cf1b8d3SKris Buschelman CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 37627cf1b8d3SKris Buschelman LOAD_PS(&t[idx],XMM7); 37637cf1b8d3SKris Buschelman 37647cf1b8d3SKris Buschelman while (nz--) { 37657cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 37667cf1b8d3SKris Buschelman jdx = 4*(*vi++); 37677cf1b8d3SKris Buschelman /* jdx = *vi++; */ 37687cf1b8d3SKris Buschelman 37697cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector product with negative accumulation: */ 37707cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[jdx],v) 37717cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 37727cf1b8d3SKris Buschelman 37737cf1b8d3SKris Buschelman /* First Column */ 37747cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 37757cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 37767cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 37777cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 37787cf1b8d3SKris Buschelman 37797cf1b8d3SKris Buschelman /* Second Column */ 37807cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 37817cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 37827cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 37837cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 37847cf1b8d3SKris Buschelman 37857cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 37867cf1b8d3SKris Buschelman 37877cf1b8d3SKris Buschelman /* Third Column */ 37887cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 37897cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 37907cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 37917cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 37927cf1b8d3SKris Buschelman 37937cf1b8d3SKris Buschelman /* Fourth Column */ 37947cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 37957cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 37967cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 37977cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 37987cf1b8d3SKris Buschelman SSE_INLINE_END_2 37997cf1b8d3SKris Buschelman 38007cf1b8d3SKris Buschelman v += 16; 38017cf1b8d3SKris Buschelman } 38027cf1b8d3SKris Buschelman v = aa + 16*ai[++i]; 38037cf1b8d3SKris Buschelman PREFETCH_NTA(v); 38047cf1b8d3SKris Buschelman STORE_PS(&t[idx],XMM7); 38057cf1b8d3SKris Buschelman } 38067cf1b8d3SKris Buschelman 38077cf1b8d3SKris Buschelman /* Backward solve the upper triangular factor.*/ 38087cf1b8d3SKris Buschelman 38097cf1b8d3SKris Buschelman idt = 4*(n-1); 38107cf1b8d3SKris Buschelman ai16 = 16*diag[n-1]; 38117cf1b8d3SKris Buschelman v = aa + ai16 + 16; 38127cf1b8d3SKris Buschelman for (i=n-1; i>=0;){ 38137cf1b8d3SKris Buschelman PREFETCH_NTA(&v[8]); 38147cf1b8d3SKris Buschelman vi = aj + diag[i] + 1; 38157cf1b8d3SKris Buschelman nz = ai[i+1] - diag[i] - 1; 38167cf1b8d3SKris Buschelman 38177cf1b8d3SKris Buschelman LOAD_PS(&t[idt],XMM7); 38187cf1b8d3SKris Buschelman 38197cf1b8d3SKris Buschelman while (nz--) { 38207cf1b8d3SKris Buschelman PREFETCH_NTA(&v[16]); 38217cf1b8d3SKris Buschelman idx = 4*(*vi++); 38227cf1b8d3SKris Buschelman /* idx = *vi++; */ 38237cf1b8d3SKris Buschelman 38247cf1b8d3SKris Buschelman /* 4x4 Matrix-Vector Product with negative accumulation: */ 38257cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_2(&t[idx],v) 38267cf1b8d3SKris Buschelman SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 38277cf1b8d3SKris Buschelman 38287cf1b8d3SKris Buschelman /* First Column */ 38297cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM6) 38307cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38317cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 38327cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM0) 38337cf1b8d3SKris Buschelman 38347cf1b8d3SKris Buschelman /* Second Column */ 38357cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM6) 38367cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38377cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 38387cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM1) 38397cf1b8d3SKris Buschelman 38407cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 38417cf1b8d3SKris Buschelman 38427cf1b8d3SKris Buschelman /* Third Column */ 38437cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM6) 38447cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38457cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 38467cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM2) 38477cf1b8d3SKris Buschelman 38487cf1b8d3SKris Buschelman /* Fourth Column */ 38497cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM6) 38507cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38517cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 38527cf1b8d3SKris Buschelman SSE_SUB_PS(XMM7,XMM3) 38537cf1b8d3SKris Buschelman SSE_INLINE_END_2 38547cf1b8d3SKris Buschelman v += 16; 38557cf1b8d3SKris Buschelman } 38567cf1b8d3SKris Buschelman v = aa + ai16; 38577cf1b8d3SKris Buschelman ai16 = 16*diag[--i]; 38587cf1b8d3SKris Buschelman PREFETCH_NTA(aa+ai16+16); 38597cf1b8d3SKris Buschelman /* 38607cf1b8d3SKris Buschelman Scale the result by the diagonal 4x4 block, 38617cf1b8d3SKris Buschelman which was inverted as part of the factorization 38627cf1b8d3SKris Buschelman */ 38637cf1b8d3SKris Buschelman SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 38647cf1b8d3SKris Buschelman /* First Column */ 38657cf1b8d3SKris Buschelman SSE_COPY_PS(XMM0,XMM7) 38667cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM0,XMM0,0x00) 38677cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 38687cf1b8d3SKris Buschelman 38697cf1b8d3SKris Buschelman /* Second Column */ 38707cf1b8d3SKris Buschelman SSE_COPY_PS(XMM1,XMM7) 38717cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM1,XMM1,0x55) 38727cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 38737cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM1) 38747cf1b8d3SKris Buschelman 38757cf1b8d3SKris Buschelman SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 38767cf1b8d3SKris Buschelman 38777cf1b8d3SKris Buschelman /* Third Column */ 38787cf1b8d3SKris Buschelman SSE_COPY_PS(XMM2,XMM7) 38797cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM2,XMM2,0xAA) 38807cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 38817cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM2) 38827cf1b8d3SKris Buschelman 38837cf1b8d3SKris Buschelman /* Fourth Column */ 38847cf1b8d3SKris Buschelman SSE_COPY_PS(XMM3,XMM7) 38857cf1b8d3SKris Buschelman SSE_SHUFFLE(XMM3,XMM3,0xFF) 38867cf1b8d3SKris Buschelman SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 38877cf1b8d3SKris Buschelman SSE_ADD_PS(XMM0,XMM3) 38887cf1b8d3SKris Buschelman 38897cf1b8d3SKris Buschelman SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 38907cf1b8d3SKris Buschelman SSE_INLINE_END_3 38917cf1b8d3SKris Buschelman 38927cf1b8d3SKris Buschelman v = aa + ai16 + 16; 38937cf1b8d3SKris Buschelman idt -= 4; 38947cf1b8d3SKris Buschelman } 38957cf1b8d3SKris Buschelman 38967cf1b8d3SKris Buschelman /* Convert t from single precision back to double precision (inplace)*/ 38977cf1b8d3SKris Buschelman idt = 4*(n-1); 38987cf1b8d3SKris Buschelman for (i=n-1;i>=0;i--) { 38997cf1b8d3SKris Buschelman /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 39007cf1b8d3SKris Buschelman /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 39017cf1b8d3SKris Buschelman PetscScalar *xtemp=&x[idt]; 39027cf1b8d3SKris Buschelman MatScalar *ttemp=&t[idt]; 39037cf1b8d3SKris Buschelman xtemp[3] = (PetscScalar)ttemp[3]; 39047cf1b8d3SKris Buschelman xtemp[2] = (PetscScalar)ttemp[2]; 39057cf1b8d3SKris Buschelman xtemp[1] = (PetscScalar)ttemp[1]; 39067cf1b8d3SKris Buschelman xtemp[0] = (PetscScalar)ttemp[0]; 39077cf1b8d3SKris Buschelman idt -= 4; 39087cf1b8d3SKris Buschelman } 39097cf1b8d3SKris Buschelman 39107cf1b8d3SKris Buschelman } /* End of artificial scope. */ 39111ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 39121ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3913dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 39147cf1b8d3SKris Buschelman SSE_SCOPE_END; 39157cf1b8d3SKris Buschelman PetscFunctionReturn(0); 39167cf1b8d3SKris Buschelman } 39177cf1b8d3SKris Buschelman 39183660e330SKris Buschelman #endif 39198f690400SShri Abhyankar 39204a2ae208SSatish Balay #undef __FUNCT__ 39214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3922dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 39234e2b4712SSatish Balay { 39244e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 39254e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 39266849ba73SBarry Smith PetscErrorCode ierr; 39275d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 39285d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3929d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 3930d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3931d9fead3dSBarry Smith const PetscScalar *b; 39324e2b4712SSatish Balay 39334e2b4712SSatish Balay PetscFunctionBegin; 3934d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39351ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3936f1af5d2fSBarry Smith t = a->solve_work; 39374e2b4712SSatish Balay 39384e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 39394e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 39404e2b4712SSatish Balay 39414e2b4712SSatish Balay /* forward solve the lower triangular */ 39424e2b4712SSatish Balay idx = 3*(*r++); 3943f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 39444e2b4712SSatish Balay for (i=1; i<n; i++) { 39454e2b4712SSatish Balay v = aa + 9*ai[i]; 39464e2b4712SSatish Balay vi = aj + ai[i]; 39474e2b4712SSatish Balay nz = diag[i] - ai[i]; 39484e2b4712SSatish Balay idx = 3*(*r++); 3949f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 39504e2b4712SSatish Balay while (nz--) { 39514e2b4712SSatish Balay idx = 3*(*vi++); 3952f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3953f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3954f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3955f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 39564e2b4712SSatish Balay v += 9; 39574e2b4712SSatish Balay } 39584e2b4712SSatish Balay idx = 3*i; 3959f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 39604e2b4712SSatish Balay } 39614e2b4712SSatish Balay /* backward solve the upper triangular */ 39624e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 39634e2b4712SSatish Balay v = aa + 9*diag[i] + 9; 39644e2b4712SSatish Balay vi = aj + diag[i] + 1; 39654e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 39664e2b4712SSatish Balay idt = 3*i; 3967f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 39684e2b4712SSatish Balay while (nz--) { 39694e2b4712SSatish Balay idx = 3*(*vi++); 3970f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3971f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3972f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3973f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 39744e2b4712SSatish Balay v += 9; 39754e2b4712SSatish Balay } 39764e2b4712SSatish Balay idc = 3*(*c--); 39774e2b4712SSatish Balay v = aa + 9*diag[i]; 3978f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3979f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3980f1af5d2fSBarry Smith x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 39814e2b4712SSatish Balay } 39824e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 39834e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3984d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 39851ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3986dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 39874e2b4712SSatish Balay PetscFunctionReturn(0); 39884e2b4712SSatish Balay } 39894e2b4712SSatish Balay 39900c4413a7SShri Abhyankar #undef __FUNCT__ 3991a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3992a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 39930c4413a7SShri Abhyankar { 39940c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 39950c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 39960c4413a7SShri Abhyankar PetscErrorCode ierr; 39970c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 39980c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 39990c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 40000c4413a7SShri Abhyankar PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 40010c4413a7SShri Abhyankar const PetscScalar *b; 40020c4413a7SShri Abhyankar 40030c4413a7SShri Abhyankar PetscFunctionBegin; 40040c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40050c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 40060c4413a7SShri Abhyankar t = a->solve_work; 40070c4413a7SShri Abhyankar 40080c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 40090c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 40100c4413a7SShri Abhyankar 40110c4413a7SShri Abhyankar /* forward solve the lower triangular */ 40120c4413a7SShri Abhyankar idx = 3*r[0]; 40130c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 40140c4413a7SShri Abhyankar for (i=1; i<n; i++) { 40150c4413a7SShri Abhyankar v = aa + 9*ai[i]; 40160c4413a7SShri Abhyankar vi = aj + ai[i]; 40170c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 40180c4413a7SShri Abhyankar idx = 3*r[i]; 40190c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 40200c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 40210c4413a7SShri Abhyankar idx = 3*vi[m]; 40220c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 40230c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 40240c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 40250c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 40260c4413a7SShri Abhyankar v += 9; 40270c4413a7SShri Abhyankar } 40280c4413a7SShri Abhyankar idx = 3*i; 40290c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 40300c4413a7SShri Abhyankar } 40310c4413a7SShri Abhyankar /* backward solve the upper triangular */ 40320c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 40330c4413a7SShri Abhyankar v = aa + 9*(adiag[i+1]+1); 40340c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 40350c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 40360c4413a7SShri Abhyankar idt = 3*i; 40370c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 40380c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 40390c4413a7SShri Abhyankar idx = 3*vi[m]; 40400c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 40410c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 40420c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 40430c4413a7SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 40440c4413a7SShri Abhyankar v += 9; 40450c4413a7SShri Abhyankar } 40460c4413a7SShri Abhyankar idc = 3*c[i]; 40470c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 40480c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 40490c4413a7SShri Abhyankar x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 40500c4413a7SShri Abhyankar } 40510c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 40520c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 40530c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40540c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 40550c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 40560c4413a7SShri Abhyankar PetscFunctionReturn(0); 40570c4413a7SShri Abhyankar } 40580c4413a7SShri Abhyankar 405915091d37SBarry Smith /* 406015091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 406115091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 406215091d37SBarry Smith */ 40634a2ae208SSatish Balay #undef __FUNCT__ 40644a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4065dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 406615091d37SBarry Smith { 406715091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4068690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4069dfbe8321SBarry Smith PetscErrorCode ierr; 4070690b6cddSBarry Smith PetscInt *diag = a->diag; 4071d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4072d9fead3dSBarry Smith PetscScalar *x,s1,s2,s3,x1,x2,x3; 4073d9fead3dSBarry Smith const PetscScalar *b; 4074690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 407515091d37SBarry Smith 407615091d37SBarry Smith PetscFunctionBegin; 4077d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 40781ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 407915091d37SBarry Smith 408015091d37SBarry Smith /* forward solve the lower triangular */ 408115091d37SBarry Smith idx = 0; 408215091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 408315091d37SBarry Smith for (i=1; i<n; i++) { 408415091d37SBarry Smith v = aa + 9*ai[i]; 408515091d37SBarry Smith vi = aj + ai[i]; 408615091d37SBarry Smith nz = diag[i] - ai[i]; 408715091d37SBarry Smith idx += 3; 4088f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 408915091d37SBarry Smith while (nz--) { 409015091d37SBarry Smith jdx = 3*(*vi++); 409115091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4092f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4093f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4094f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 409515091d37SBarry Smith v += 9; 409615091d37SBarry Smith } 4097f1af5d2fSBarry Smith x[idx] = s1; 4098f1af5d2fSBarry Smith x[1+idx] = s2; 4099f1af5d2fSBarry Smith x[2+idx] = s3; 410015091d37SBarry Smith } 410115091d37SBarry Smith /* backward solve the upper triangular */ 410215091d37SBarry Smith for (i=n-1; i>=0; i--){ 410315091d37SBarry Smith v = aa + 9*diag[i] + 9; 410415091d37SBarry Smith vi = aj + diag[i] + 1; 410515091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 410615091d37SBarry Smith idt = 3*i; 4107f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 4108f1af5d2fSBarry Smith s3 = x[2+idt]; 410915091d37SBarry Smith while (nz--) { 411015091d37SBarry Smith idx = 3*(*vi++); 411115091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4112f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4113f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4114f1af5d2fSBarry Smith s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 411515091d37SBarry Smith v += 9; 411615091d37SBarry Smith } 411715091d37SBarry Smith v = aa + 9*diag[i]; 4118f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4119f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4120f1af5d2fSBarry Smith x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 412115091d37SBarry Smith } 412215091d37SBarry Smith 4123d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 41241ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4125dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 412615091d37SBarry Smith PetscFunctionReturn(0); 412715091d37SBarry Smith } 412815091d37SBarry Smith 4129cee9d6f2SShri Abhyankar #undef __FUNCT__ 4130a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4131a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4132b2b2dd24SShri Abhyankar { 4133b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4134b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4135b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4136b2b2dd24SShri Abhyankar PetscInt idx,jdx,idt; 4137b2b2dd24SShri Abhyankar PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4138b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4139b2b2dd24SShri Abhyankar PetscScalar *x; 4140b2b2dd24SShri Abhyankar const PetscScalar *b; 4141b2b2dd24SShri Abhyankar PetscScalar s1,s2,s3,x1,x2,x3; 4142b2b2dd24SShri Abhyankar 4143b2b2dd24SShri Abhyankar PetscFunctionBegin; 4144b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4145b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4146b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4147b2b2dd24SShri Abhyankar idx = 0; 4148b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4149b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4150b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 4151b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4152b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4153b2b2dd24SShri Abhyankar idx = bs*i; 4154b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4155b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4156b2b2dd24SShri Abhyankar jdx = bs*vi[k]; 4157b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4158b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4159b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4160b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4161b2b2dd24SShri Abhyankar 4162b2b2dd24SShri Abhyankar v += bs2; 4163b2b2dd24SShri Abhyankar } 4164b2b2dd24SShri Abhyankar 4165b2b2dd24SShri Abhyankar x[idx] = s1; 4166b2b2dd24SShri Abhyankar x[1+idx] = s2; 4167b2b2dd24SShri Abhyankar x[2+idx] = s3; 4168b2b2dd24SShri Abhyankar } 4169b2b2dd24SShri Abhyankar 4170b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4171b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4172b2b2dd24SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 4173b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4174b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4175b2b2dd24SShri Abhyankar idt = bs*i; 4176b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4177b2b2dd24SShri Abhyankar 4178b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4179b2b2dd24SShri Abhyankar idx = bs*vi[k]; 4180b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4181b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4182b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4183b2b2dd24SShri Abhyankar s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4184b2b2dd24SShri Abhyankar 4185b2b2dd24SShri Abhyankar v += bs2; 4186b2b2dd24SShri Abhyankar } 4187b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4188b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4189b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4190b2b2dd24SShri Abhyankar x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4191b2b2dd24SShri Abhyankar 4192b2b2dd24SShri Abhyankar } 4193b2b2dd24SShri Abhyankar 4194b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4195b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4197b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4198b2b2dd24SShri Abhyankar } 4199b2b2dd24SShri Abhyankar 4200b2b2dd24SShri Abhyankar #undef __FUNCT__ 42014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4202dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 42034e2b4712SSatish Balay { 42044e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42054e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 42066849ba73SBarry Smith PetscErrorCode ierr; 42075d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 42085d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4209d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4210d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2,*t; 4211d9fead3dSBarry Smith const PetscScalar *b; 42124e2b4712SSatish Balay 42134e2b4712SSatish Balay PetscFunctionBegin; 4214d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42151ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4216f1af5d2fSBarry Smith t = a->solve_work; 42174e2b4712SSatish Balay 42184e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 42194e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 42204e2b4712SSatish Balay 42214e2b4712SSatish Balay /* forward solve the lower triangular */ 42224e2b4712SSatish Balay idx = 2*(*r++); 4223f1af5d2fSBarry Smith t[0] = b[idx]; t[1] = b[1+idx]; 42244e2b4712SSatish Balay for (i=1; i<n; i++) { 42254e2b4712SSatish Balay v = aa + 4*ai[i]; 42264e2b4712SSatish Balay vi = aj + ai[i]; 42274e2b4712SSatish Balay nz = diag[i] - ai[i]; 42284e2b4712SSatish Balay idx = 2*(*r++); 4229f1af5d2fSBarry Smith s1 = b[idx]; s2 = b[1+idx]; 42304e2b4712SSatish Balay while (nz--) { 42314e2b4712SSatish Balay idx = 2*(*vi++); 4232f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4233f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4234f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 42354e2b4712SSatish Balay v += 4; 42364e2b4712SSatish Balay } 42374e2b4712SSatish Balay idx = 2*i; 4238f1af5d2fSBarry Smith t[idx] = s1; t[1+idx] = s2; 42394e2b4712SSatish Balay } 42404e2b4712SSatish Balay /* backward solve the upper triangular */ 42414e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 42424e2b4712SSatish Balay v = aa + 4*diag[i] + 4; 42434e2b4712SSatish Balay vi = aj + diag[i] + 1; 42444e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 42454e2b4712SSatish Balay idt = 2*i; 4246f1af5d2fSBarry Smith s1 = t[idt]; s2 = t[1+idt]; 42474e2b4712SSatish Balay while (nz--) { 42484e2b4712SSatish Balay idx = 2*(*vi++); 4249f1af5d2fSBarry Smith x1 = t[idx]; x2 = t[1+idx]; 4250f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4251f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 42524e2b4712SSatish Balay v += 4; 42534e2b4712SSatish Balay } 42544e2b4712SSatish Balay idc = 2*(*c--); 42554e2b4712SSatish Balay v = aa + 4*diag[i]; 4256f1af5d2fSBarry Smith x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4257f1af5d2fSBarry Smith x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 42584e2b4712SSatish Balay } 42594e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 42604e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4261d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42621ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4263dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 42644e2b4712SSatish Balay PetscFunctionReturn(0); 42654e2b4712SSatish Balay } 42664e2b4712SSatish Balay 42670c4413a7SShri Abhyankar #undef __FUNCT__ 4268a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4269a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 42700c4413a7SShri Abhyankar { 42710c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 42720c4413a7SShri Abhyankar IS iscol=a->col,isrow=a->row; 42730c4413a7SShri Abhyankar PetscErrorCode ierr; 42740c4413a7SShri Abhyankar PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 42750c4413a7SShri Abhyankar const PetscInt *r,*c,*rout,*cout; 42760c4413a7SShri Abhyankar const MatScalar *aa=a->a,*v; 42770c4413a7SShri Abhyankar PetscScalar *x,s1,s2,x1,x2,*t; 42780c4413a7SShri Abhyankar const PetscScalar *b; 42790c4413a7SShri Abhyankar 42800c4413a7SShri Abhyankar PetscFunctionBegin; 42810c4413a7SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 42820c4413a7SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 42830c4413a7SShri Abhyankar t = a->solve_work; 42840c4413a7SShri Abhyankar 42850c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 42860c4413a7SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 42870c4413a7SShri Abhyankar 42880c4413a7SShri Abhyankar /* forward solve the lower triangular */ 42890c4413a7SShri Abhyankar idx = 2*r[0]; 42900c4413a7SShri Abhyankar t[0] = b[idx]; t[1] = b[1+idx]; 42910c4413a7SShri Abhyankar for (i=1; i<n; i++) { 42920c4413a7SShri Abhyankar v = aa + 4*ai[i]; 42930c4413a7SShri Abhyankar vi = aj + ai[i]; 42940c4413a7SShri Abhyankar nz = ai[i+1] - ai[i]; 42950c4413a7SShri Abhyankar idx = 2*r[i]; 42960c4413a7SShri Abhyankar s1 = b[idx]; s2 = b[1+idx]; 42970c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 42980c4413a7SShri Abhyankar jdx = 2*vi[m]; 42990c4413a7SShri Abhyankar x1 = t[jdx]; x2 = t[1+jdx]; 43000c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 43010c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 43020c4413a7SShri Abhyankar v += 4; 43030c4413a7SShri Abhyankar } 43040c4413a7SShri Abhyankar idx = 2*i; 43050c4413a7SShri Abhyankar t[idx] = s1; t[1+idx] = s2; 43060c4413a7SShri Abhyankar } 43070c4413a7SShri Abhyankar /* backward solve the upper triangular */ 43080c4413a7SShri Abhyankar for (i=n-1; i>=0; i--){ 43090c4413a7SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 43100c4413a7SShri Abhyankar vi = aj + adiag[i+1]+1; 43110c4413a7SShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 43120c4413a7SShri Abhyankar idt = 2*i; 43130c4413a7SShri Abhyankar s1 = t[idt]; s2 = t[1+idt]; 43140c4413a7SShri Abhyankar for(m=0;m<nz;m++){ 43150c4413a7SShri Abhyankar idx = 2*vi[m]; 43160c4413a7SShri Abhyankar x1 = t[idx]; x2 = t[1+idx]; 43170c4413a7SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 43180c4413a7SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 43190c4413a7SShri Abhyankar v += 4; 43200c4413a7SShri Abhyankar } 43210c4413a7SShri Abhyankar idc = 2*c[i]; 43220c4413a7SShri Abhyankar x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 43230c4413a7SShri Abhyankar x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 43240c4413a7SShri Abhyankar } 43250c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 43260c4413a7SShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 43270c4413a7SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43280c4413a7SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 43290c4413a7SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 43300c4413a7SShri Abhyankar PetscFunctionReturn(0); 43310c4413a7SShri Abhyankar } 43328f690400SShri Abhyankar 433315091d37SBarry Smith /* 433415091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 433515091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 433615091d37SBarry Smith */ 43374a2ae208SSatish Balay #undef __FUNCT__ 43384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4339dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 434015091d37SBarry Smith { 434115091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4342690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4343dfbe8321SBarry Smith PetscErrorCode ierr; 4344690b6cddSBarry Smith PetscInt *diag = a->diag; 4345d9fead3dSBarry Smith const MatScalar *aa=a->a,*v; 4346d9fead3dSBarry Smith PetscScalar *x,s1,s2,x1,x2; 4347d9fead3dSBarry Smith const PetscScalar *b; 4348690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 434915091d37SBarry Smith 435015091d37SBarry Smith PetscFunctionBegin; 4351d9fead3dSBarry Smith ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43521ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 435315091d37SBarry Smith 435415091d37SBarry Smith /* forward solve the lower triangular */ 435515091d37SBarry Smith idx = 0; 435615091d37SBarry Smith x[0] = b[0]; x[1] = b[1]; 435715091d37SBarry Smith for (i=1; i<n; i++) { 435815091d37SBarry Smith v = aa + 4*ai[i]; 435915091d37SBarry Smith vi = aj + ai[i]; 436015091d37SBarry Smith nz = diag[i] - ai[i]; 436115091d37SBarry Smith idx += 2; 4362f1af5d2fSBarry Smith s1 = b[idx];s2 = b[1+idx]; 436315091d37SBarry Smith while (nz--) { 436415091d37SBarry Smith jdx = 2*(*vi++); 436515091d37SBarry Smith x1 = x[jdx];x2 = x[1+jdx]; 4366f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4367f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 436815091d37SBarry Smith v += 4; 436915091d37SBarry Smith } 4370f1af5d2fSBarry Smith x[idx] = s1; 4371f1af5d2fSBarry Smith x[1+idx] = s2; 437215091d37SBarry Smith } 437315091d37SBarry Smith /* backward solve the upper triangular */ 437415091d37SBarry Smith for (i=n-1; i>=0; i--){ 437515091d37SBarry Smith v = aa + 4*diag[i] + 4; 437615091d37SBarry Smith vi = aj + diag[i] + 1; 437715091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 437815091d37SBarry Smith idt = 2*i; 4379f1af5d2fSBarry Smith s1 = x[idt]; s2 = x[1+idt]; 438015091d37SBarry Smith while (nz--) { 438115091d37SBarry Smith idx = 2*(*vi++); 438215091d37SBarry Smith x1 = x[idx]; x2 = x[1+idx]; 4383f1af5d2fSBarry Smith s1 -= v[0]*x1 + v[2]*x2; 4384f1af5d2fSBarry Smith s2 -= v[1]*x1 + v[3]*x2; 438515091d37SBarry Smith v += 4; 438615091d37SBarry Smith } 438715091d37SBarry Smith v = aa + 4*diag[i]; 4388f1af5d2fSBarry Smith x[idt] = v[0]*s1 + v[2]*s2; 4389f1af5d2fSBarry Smith x[1+idt] = v[1]*s1 + v[3]*s2; 439015091d37SBarry Smith } 439115091d37SBarry Smith 4392d9fead3dSBarry Smith ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 43931ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4394dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 439515091d37SBarry Smith PetscFunctionReturn(0); 439615091d37SBarry Smith } 439715091d37SBarry Smith 4398cee9d6f2SShri Abhyankar #undef __FUNCT__ 4399a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4400a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4401b2b2dd24SShri Abhyankar { 4402b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4403b2b2dd24SShri Abhyankar PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4404b2b2dd24SShri Abhyankar PetscErrorCode ierr; 4405b2b2dd24SShri Abhyankar PetscInt jdx; 4406b2b2dd24SShri Abhyankar const MatScalar *aa=a->a,*v; 4407b2b2dd24SShri Abhyankar PetscScalar *x,s1,s2,x1,x2; 4408b2b2dd24SShri Abhyankar const PetscScalar *b; 4409b2b2dd24SShri Abhyankar 4410b2b2dd24SShri Abhyankar PetscFunctionBegin; 4411b2b2dd24SShri Abhyankar ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4412b2b2dd24SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4413b2b2dd24SShri Abhyankar /* forward solve the lower triangular */ 4414b2b2dd24SShri Abhyankar idx = 0; 4415b2b2dd24SShri Abhyankar x[0] = b[idx]; x[1] = b[1+idx]; 4416b2b2dd24SShri Abhyankar for (i=1; i<n; i++) { 4417b2b2dd24SShri Abhyankar v = aa + 4*ai[i]; 4418b2b2dd24SShri Abhyankar vi = aj + ai[i]; 4419b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 4420b2b2dd24SShri Abhyankar idx = 2*i; 4421b2b2dd24SShri Abhyankar s1 = b[idx];s2 = b[1+idx]; 4422b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4423b2b2dd24SShri Abhyankar jdx = 2*vi[k]; 4424b2b2dd24SShri Abhyankar x1 = x[jdx];x2 = x[1+jdx]; 4425b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4426b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4427b2b2dd24SShri Abhyankar v += 4; 4428b2b2dd24SShri Abhyankar } 4429b2b2dd24SShri Abhyankar x[idx] = s1; 4430b2b2dd24SShri Abhyankar x[1+idx] = s2; 4431b2b2dd24SShri Abhyankar } 4432b2b2dd24SShri Abhyankar 4433b2b2dd24SShri Abhyankar /* backward solve the upper triangular */ 4434b2b2dd24SShri Abhyankar for (i=n-1; i>=0; i--){ 4435b2b2dd24SShri Abhyankar v = aa + 4*(adiag[i+1]+1); 4436b2b2dd24SShri Abhyankar vi = aj + adiag[i+1]+1; 4437b2b2dd24SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 4438b2b2dd24SShri Abhyankar idt = 2*i; 4439b2b2dd24SShri Abhyankar s1 = x[idt]; s2 = x[1+idt]; 4440b2b2dd24SShri Abhyankar for(k=0;k<nz;k++){ 4441b2b2dd24SShri Abhyankar idx = 2*vi[k]; 4442b2b2dd24SShri Abhyankar x1 = x[idx]; x2 = x[1+idx]; 4443b2b2dd24SShri Abhyankar s1 -= v[0]*x1 + v[2]*x2; 4444b2b2dd24SShri Abhyankar s2 -= v[1]*x1 + v[3]*x2; 4445b2b2dd24SShri Abhyankar v += 4; 4446b2b2dd24SShri Abhyankar } 4447b2b2dd24SShri Abhyankar /* x = inv_diagonal*x */ 4448b2b2dd24SShri Abhyankar x[idt] = v[0]*s1 + v[2]*s2; 4449b2b2dd24SShri Abhyankar x[1+idt] = v[1]*s1 + v[3]*s2; 4450b2b2dd24SShri Abhyankar } 4451b2b2dd24SShri Abhyankar 4452b2b2dd24SShri Abhyankar ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4453b2b2dd24SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4454b2b2dd24SShri Abhyankar ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4455b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 4456b2b2dd24SShri Abhyankar } 4457b2b2dd24SShri Abhyankar 4458b2b2dd24SShri Abhyankar #undef __FUNCT__ 44594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4460dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 44614e2b4712SSatish Balay { 44624e2b4712SSatish Balay Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 44634e2b4712SSatish Balay IS iscol=a->col,isrow=a->row; 44646849ba73SBarry Smith PetscErrorCode ierr; 44655d0c19d7SBarry Smith PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 44665d0c19d7SBarry Smith const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 44673f1db9ecSBarry Smith MatScalar *aa=a->a,*v; 446887828ca2SBarry Smith PetscScalar *x,*b,s1,*t; 44694e2b4712SSatish Balay 44704e2b4712SSatish Balay PetscFunctionBegin; 44714e2b4712SSatish Balay if (!n) PetscFunctionReturn(0); 44724e2b4712SSatish Balay 44731ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 44741ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4475f1af5d2fSBarry Smith t = a->solve_work; 44764e2b4712SSatish Balay 44774e2b4712SSatish Balay ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 44784e2b4712SSatish Balay ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 44794e2b4712SSatish Balay 44804e2b4712SSatish Balay /* forward solve the lower triangular */ 4481f1af5d2fSBarry Smith t[0] = b[*r++]; 44824e2b4712SSatish Balay for (i=1; i<n; i++) { 44834e2b4712SSatish Balay v = aa + ai[i]; 44844e2b4712SSatish Balay vi = aj + ai[i]; 44854e2b4712SSatish Balay nz = diag[i] - ai[i]; 4486f1af5d2fSBarry Smith s1 = b[*r++]; 44874e2b4712SSatish Balay while (nz--) { 4488f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 44894e2b4712SSatish Balay } 4490f1af5d2fSBarry Smith t[i] = s1; 44914e2b4712SSatish Balay } 44924e2b4712SSatish Balay /* backward solve the upper triangular */ 44934e2b4712SSatish Balay for (i=n-1; i>=0; i--){ 44944e2b4712SSatish Balay v = aa + diag[i] + 1; 44954e2b4712SSatish Balay vi = aj + diag[i] + 1; 44964e2b4712SSatish Balay nz = ai[i+1] - diag[i] - 1; 4497f1af5d2fSBarry Smith s1 = t[i]; 44984e2b4712SSatish Balay while (nz--) { 4499f1af5d2fSBarry Smith s1 -= (*v++)*t[*vi++]; 45004e2b4712SSatish Balay } 4501f1af5d2fSBarry Smith x[*c--] = t[i] = aa[diag[i]]*s1; 45024e2b4712SSatish Balay } 45034e2b4712SSatish Balay 45044e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 45054e2b4712SSatish Balay ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 45061ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45071ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4508dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 45094e2b4712SSatish Balay PetscFunctionReturn(0); 45104e2b4712SSatish Balay } 451115091d37SBarry Smith /* 451215091d37SBarry Smith Special case where the matrix was ILU(0) factored in the natural 451315091d37SBarry Smith ordering. This eliminates the need for the column and row permutation. 451415091d37SBarry Smith */ 45154a2ae208SSatish Balay #undef __FUNCT__ 45164a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4517dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 451815091d37SBarry Smith { 451915091d37SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4520690b6cddSBarry Smith PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4521dfbe8321SBarry Smith PetscErrorCode ierr; 4522690b6cddSBarry Smith PetscInt *diag = a->diag; 452315091d37SBarry Smith MatScalar *aa=a->a; 452487828ca2SBarry Smith PetscScalar *x,*b; 452587828ca2SBarry Smith PetscScalar s1,x1; 452615091d37SBarry Smith MatScalar *v; 4527690b6cddSBarry Smith PetscInt jdx,idt,idx,nz,*vi,i; 452815091d37SBarry Smith 452915091d37SBarry Smith PetscFunctionBegin; 45301ebc52fbSHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 45311ebc52fbSHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 453215091d37SBarry Smith 453315091d37SBarry Smith /* forward solve the lower triangular */ 453415091d37SBarry Smith idx = 0; 453515091d37SBarry Smith x[0] = b[0]; 453615091d37SBarry Smith for (i=1; i<n; i++) { 453715091d37SBarry Smith v = aa + ai[i]; 453815091d37SBarry Smith vi = aj + ai[i]; 453915091d37SBarry Smith nz = diag[i] - ai[i]; 454015091d37SBarry Smith idx += 1; 4541f1af5d2fSBarry Smith s1 = b[idx]; 454215091d37SBarry Smith while (nz--) { 454315091d37SBarry Smith jdx = *vi++; 454415091d37SBarry Smith x1 = x[jdx]; 4545f1af5d2fSBarry Smith s1 -= v[0]*x1; 454615091d37SBarry Smith v += 1; 454715091d37SBarry Smith } 4548f1af5d2fSBarry Smith x[idx] = s1; 454915091d37SBarry Smith } 455015091d37SBarry Smith /* backward solve the upper triangular */ 455115091d37SBarry Smith for (i=n-1; i>=0; i--){ 455215091d37SBarry Smith v = aa + diag[i] + 1; 455315091d37SBarry Smith vi = aj + diag[i] + 1; 455415091d37SBarry Smith nz = ai[i+1] - diag[i] - 1; 455515091d37SBarry Smith idt = i; 4556f1af5d2fSBarry Smith s1 = x[idt]; 455715091d37SBarry Smith while (nz--) { 455815091d37SBarry Smith idx = *vi++; 455915091d37SBarry Smith x1 = x[idx]; 4560f1af5d2fSBarry Smith s1 -= v[0]*x1; 456115091d37SBarry Smith v += 1; 456215091d37SBarry Smith } 456315091d37SBarry Smith v = aa + diag[i]; 4564f1af5d2fSBarry Smith x[idt] = v[0]*s1; 456515091d37SBarry Smith } 45661ebc52fbSHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 45671ebc52fbSHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4568dc0b31edSSatish Balay ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 456915091d37SBarry Smith PetscFunctionReturn(0); 457015091d37SBarry Smith } 45714e2b4712SSatish Balay 45724e2b4712SSatish Balay /* ----------------------------------------------------------------*/ 457316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 45746bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4575ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 45766bce7ff8SHong Zhang 45776bce7ff8SHong Zhang #undef __FUNCT__ 45786bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 45796bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 45806bce7ff8SHong Zhang { 45816bce7ff8SHong Zhang Mat C=B; 45826bce7ff8SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 45836bce7ff8SHong Zhang IS isrow = b->row,isicol = b->icol; 45846bce7ff8SHong Zhang PetscErrorCode ierr; 45856bce7ff8SHong Zhang const PetscInt *r,*ic,*ics; 45866bce7ff8SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 45876bce7ff8SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4588b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4589914a18a2SHong Zhang PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4590914a18a2SHong Zhang MatScalar *v_work; 4591ae3d28f0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 45926bce7ff8SHong Zhang 45936bce7ff8SHong Zhang PetscFunctionBegin; 45946bce7ff8SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 45956bce7ff8SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4596ae3d28f0SHong Zhang 4597fca92195SBarry Smith ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4598fca92195SBarry Smith ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 45996bce7ff8SHong Zhang ics = ic; 46006bce7ff8SHong Zhang 4601914a18a2SHong Zhang /* generate work space needed by dense LU factorization */ 4602fca92195SBarry Smith ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 4603914a18a2SHong Zhang 46046bce7ff8SHong Zhang for (i=0; i<n; i++){ 46056bce7ff8SHong Zhang /* zero rtmp */ 46066bce7ff8SHong Zhang /* L part */ 46076bce7ff8SHong Zhang nz = bi[i+1] - bi[i]; 46086bce7ff8SHong Zhang bjtmp = bj + bi[i]; 4609914a18a2SHong Zhang for (j=0; j<nz; j++){ 4610914a18a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4611914a18a2SHong Zhang } 46126bce7ff8SHong Zhang 46136bce7ff8SHong Zhang /* U part */ 46141a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 46151a83e813SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 46161a83e813SShri Abhyankar for (j=0; j<nz; j++){ 46171a83e813SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 46181a83e813SShri Abhyankar } 46191a83e813SShri Abhyankar 46201a83e813SShri Abhyankar /* load in initial (unfactored row) */ 46211a83e813SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 46221a83e813SShri Abhyankar ajtmp = aj + ai[r[i]]; 46231a83e813SShri Abhyankar v = aa + bs2*ai[r[i]]; 46241a83e813SShri Abhyankar for (j=0; j<nz; j++) { 46251a83e813SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 46261a83e813SShri Abhyankar } 46271a83e813SShri Abhyankar 46281a83e813SShri Abhyankar /* elimination */ 46291a83e813SShri Abhyankar bjtmp = bj + bi[i]; 46301a83e813SShri Abhyankar nzL = bi[i+1] - bi[i]; 46311a83e813SShri Abhyankar for(k=0;k < nzL;k++) { 46321a83e813SShri Abhyankar row = bjtmp[k]; 46331a83e813SShri Abhyankar pc = rtmp + bs2*row; 46341a83e813SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 46351a83e813SShri Abhyankar if (flg) { 46361a83e813SShri Abhyankar pv = b->a + bs2*bdiag[row]; 46371a83e813SShri Abhyankar Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 46381a83e813SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 46391a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 46401a83e813SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 46411a83e813SShri Abhyankar for (j=0; j<nz; j++) { 46421a83e813SShri Abhyankar Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 46431a83e813SShri Abhyankar } 46441a83e813SShri Abhyankar ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 46451a83e813SShri Abhyankar } 46461a83e813SShri Abhyankar } 46471a83e813SShri Abhyankar 46481a83e813SShri Abhyankar /* finished row so stick it into b->a */ 46491a83e813SShri Abhyankar /* L part */ 46501a83e813SShri Abhyankar pv = b->a + bs2*bi[i] ; 46511a83e813SShri Abhyankar pj = b->j + bi[i] ; 46521a83e813SShri Abhyankar nz = bi[i+1] - bi[i]; 46531a83e813SShri Abhyankar for (j=0; j<nz; j++) { 46541a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 46551a83e813SShri Abhyankar } 46561a83e813SShri Abhyankar 46571a83e813SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 46581a83e813SShri Abhyankar pv = b->a + bs2*bdiag[i]; 46591a83e813SShri Abhyankar pj = b->j + bdiag[i]; 46601a83e813SShri Abhyankar /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 46611a83e813SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 46621a83e813SShri Abhyankar ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 46631a83e813SShri Abhyankar 46641a83e813SShri Abhyankar /* U part */ 46651a83e813SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 46661a83e813SShri Abhyankar pj = b->j + bdiag[i+1]+1; 46671a83e813SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 46681a83e813SShri Abhyankar for (j=0; j<nz; j++){ 46691a83e813SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 46701a83e813SShri Abhyankar } 46711a83e813SShri Abhyankar } 46721a83e813SShri Abhyankar 46731a83e813SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 4674fca92195SBarry Smith ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 46751a83e813SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 46761a83e813SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 46771a83e813SShri Abhyankar 4678ae3d28f0SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4679ae3d28f0SHong Zhang ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 4680ae3d28f0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 4681ae3d28f0SHong Zhang if (both_identity){ 4682a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4683ae3d28f0SHong Zhang } else { 4684a2d6a19aSShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 4685ae3d28f0SHong Zhang } 4686*8499736aSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct; 4687ae3d28f0SHong Zhang 46881a83e813SShri Abhyankar C->assembled = PETSC_TRUE; 46891a83e813SShri Abhyankar ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 46901a83e813SShri Abhyankar PetscFunctionReturn(0); 46911a83e813SShri Abhyankar } 46921a83e813SShri Abhyankar 46936bce7ff8SHong Zhang /* 46946bce7ff8SHong Zhang ilu(0) with natural ordering under new data structure. 469516a2bf60SHong Zhang See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 469616a2bf60SHong Zhang because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 46976bce7ff8SHong Zhang */ 4698c0c7eb62SShri Abhyankar 46996bce7ff8SHong Zhang #undef __FUNCT__ 47006bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 47016bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 47026bce7ff8SHong Zhang { 47036bce7ff8SHong Zhang 47046bce7ff8SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 47056bce7ff8SHong Zhang PetscErrorCode ierr; 470616a2bf60SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 470735aa4fcfSShri Abhyankar PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 470835aa4fcfSShri Abhyankar 470935aa4fcfSShri Abhyankar PetscFunctionBegin; 471035aa4fcfSShri Abhyankar ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 471135aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 471235aa4fcfSShri Abhyankar 471335aa4fcfSShri Abhyankar /* allocate matrix arrays for new data structure */ 471435aa4fcfSShri Abhyankar ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 471535aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 471635aa4fcfSShri Abhyankar b->singlemalloc = PETSC_TRUE; 471735aa4fcfSShri Abhyankar if (!b->diag){ 471835aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 471935aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 472035aa4fcfSShri Abhyankar } 472135aa4fcfSShri Abhyankar bdiag = b->diag; 472235aa4fcfSShri Abhyankar 472335aa4fcfSShri Abhyankar if (n > 0) { 472435aa4fcfSShri Abhyankar ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 472535aa4fcfSShri Abhyankar } 472635aa4fcfSShri Abhyankar 472735aa4fcfSShri Abhyankar /* set bi and bj with new data structure */ 472835aa4fcfSShri Abhyankar bi = b->i; 472935aa4fcfSShri Abhyankar bj = b->j; 473035aa4fcfSShri Abhyankar 473135aa4fcfSShri Abhyankar /* L part */ 473235aa4fcfSShri Abhyankar bi[0] = 0; 473335aa4fcfSShri Abhyankar for (i=0; i<n; i++){ 473435aa4fcfSShri Abhyankar nz = adiag[i] - ai[i]; 473535aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nz; 473635aa4fcfSShri Abhyankar aj = a->j + ai[i]; 473735aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 473835aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 473935aa4fcfSShri Abhyankar } 474035aa4fcfSShri Abhyankar } 474135aa4fcfSShri Abhyankar 474235aa4fcfSShri Abhyankar /* U part */ 474335aa4fcfSShri Abhyankar bi_temp = bi[n]; 474435aa4fcfSShri Abhyankar bdiag[n] = bi[n]-1; 474535aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 474635aa4fcfSShri Abhyankar nz = ai[i+1] - adiag[i] - 1; 474735aa4fcfSShri Abhyankar bi_temp = bi_temp + nz + 1; 474835aa4fcfSShri Abhyankar aj = a->j + adiag[i] + 1; 474935aa4fcfSShri Abhyankar for (j=0; j<nz; j++){ 475035aa4fcfSShri Abhyankar *bj = aj[j]; bj++; 475135aa4fcfSShri Abhyankar } 475235aa4fcfSShri Abhyankar /* diag[i] */ 475335aa4fcfSShri Abhyankar *bj = i; bj++; 475435aa4fcfSShri Abhyankar bdiag[i] = bi_temp - 1; 475535aa4fcfSShri Abhyankar } 475635aa4fcfSShri Abhyankar PetscFunctionReturn(0); 475735aa4fcfSShri Abhyankar } 475835aa4fcfSShri Abhyankar 475935aa4fcfSShri Abhyankar #undef __FUNCT__ 476016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 476116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 476216a2bf60SHong Zhang { 476316a2bf60SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 476416a2bf60SHong Zhang IS isicol; 476516a2bf60SHong Zhang PetscErrorCode ierr; 476616a2bf60SHong Zhang const PetscInt *r,*ic; 47677fa3a6a0SHong Zhang PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 476816a2bf60SHong Zhang PetscInt *bi,*cols,nnz,*cols_lvl; 476916a2bf60SHong Zhang PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 477016a2bf60SHong Zhang PetscInt i,levels,diagonal_fill; 47717fa3a6a0SHong Zhang PetscTruth col_identity,row_identity,both_identity; 477216a2bf60SHong Zhang PetscReal f; 477316a2bf60SHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 477416a2bf60SHong Zhang PetscBT lnkbt; 477516a2bf60SHong Zhang PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 477616a2bf60SHong Zhang PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 477716a2bf60SHong Zhang PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 477816a2bf60SHong Zhang PetscTruth missing; 47797fa3a6a0SHong Zhang PetscInt bs=A->rmap->bs,bs2=a->bs2; 478016a2bf60SHong Zhang 478116a2bf60SHong Zhang PetscFunctionBegin; 478216a2bf60SHong Zhang if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 478316a2bf60SHong Zhang ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 478416a2bf60SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 478516a2bf60SHong Zhang 478616a2bf60SHong Zhang f = info->fill; 478716a2bf60SHong Zhang levels = (PetscInt)info->levels; 478816a2bf60SHong Zhang diagonal_fill = (PetscInt)info->diagonal_fill; 478916a2bf60SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 479016a2bf60SHong Zhang 479116a2bf60SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 479216a2bf60SHong Zhang ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 47937fa3a6a0SHong Zhang both_identity = (PetscTruth) (row_identity && col_identity); 479416a2bf60SHong Zhang 47957fa3a6a0SHong Zhang if (!levels && both_identity) { 479616a2bf60SHong Zhang /* special case: ilu(0) with natural ordering */ 479716a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4798ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 479935aa4fcfSShri Abhyankar 480035aa4fcfSShri Abhyankar fact->factor = MAT_FACTOR_ILU; 480135aa4fcfSShri Abhyankar (fact)->info.factor_mallocs = 0; 480235aa4fcfSShri Abhyankar (fact)->info.fill_ratio_given = info->fill; 480335aa4fcfSShri Abhyankar (fact)->info.fill_ratio_needed = 1.0; 480435aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 480535aa4fcfSShri Abhyankar b->row = isrow; 480635aa4fcfSShri Abhyankar b->col = iscol; 480735aa4fcfSShri Abhyankar b->icol = isicol; 480835aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 480935aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 481035aa4fcfSShri Abhyankar b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 481135aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 481235aa4fcfSShri Abhyankar PetscFunctionReturn(0); 481335aa4fcfSShri Abhyankar } 481435aa4fcfSShri Abhyankar 481535aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 481635aa4fcfSShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 481735aa4fcfSShri Abhyankar 481835aa4fcfSShri Abhyankar /* get new row pointers */ 481935aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 482035aa4fcfSShri Abhyankar bi[0] = 0; 482135aa4fcfSShri Abhyankar /* bdiag is location of diagonal in factor */ 482235aa4fcfSShri Abhyankar ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 482335aa4fcfSShri Abhyankar bdiag[0] = 0; 482435aa4fcfSShri Abhyankar 4825fca92195SBarry Smith ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 482635aa4fcfSShri Abhyankar 482735aa4fcfSShri Abhyankar /* create a linked list for storing column indices of the active row */ 482835aa4fcfSShri Abhyankar nlnk = n + 1; 482935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 483035aa4fcfSShri Abhyankar 483135aa4fcfSShri Abhyankar /* initial FreeSpace size is f*(ai[n]+1) */ 483235aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 483335aa4fcfSShri Abhyankar current_space = free_space; 483435aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 483535aa4fcfSShri Abhyankar current_space_lvl = free_space_lvl; 483635aa4fcfSShri Abhyankar 483735aa4fcfSShri Abhyankar for (i=0; i<n; i++) { 483835aa4fcfSShri Abhyankar nzi = 0; 483935aa4fcfSShri Abhyankar /* copy current row into linked list */ 484035aa4fcfSShri Abhyankar nnz = ai[r[i]+1] - ai[r[i]]; 484135aa4fcfSShri Abhyankar if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 484235aa4fcfSShri Abhyankar cols = aj + ai[r[i]]; 484335aa4fcfSShri Abhyankar lnk[i] = -1; /* marker to indicate if diagonal exists */ 484435aa4fcfSShri Abhyankar ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 484535aa4fcfSShri Abhyankar nzi += nlnk; 484635aa4fcfSShri Abhyankar 484735aa4fcfSShri Abhyankar /* make sure diagonal entry is included */ 484835aa4fcfSShri Abhyankar if (diagonal_fill && lnk[i] == -1) { 484935aa4fcfSShri Abhyankar fm = n; 485035aa4fcfSShri Abhyankar while (lnk[fm] < i) fm = lnk[fm]; 485135aa4fcfSShri Abhyankar lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 485235aa4fcfSShri Abhyankar lnk[fm] = i; 485335aa4fcfSShri Abhyankar lnk_lvl[i] = 0; 485435aa4fcfSShri Abhyankar nzi++; dcount++; 485535aa4fcfSShri Abhyankar } 485635aa4fcfSShri Abhyankar 485735aa4fcfSShri Abhyankar /* add pivot rows into the active row */ 485835aa4fcfSShri Abhyankar nzbd = 0; 485935aa4fcfSShri Abhyankar prow = lnk[n]; 486035aa4fcfSShri Abhyankar while (prow < i) { 486135aa4fcfSShri Abhyankar nnz = bdiag[prow]; 486235aa4fcfSShri Abhyankar cols = bj_ptr[prow] + nnz + 1; 486335aa4fcfSShri Abhyankar cols_lvl = bjlvl_ptr[prow] + nnz + 1; 486435aa4fcfSShri Abhyankar nnz = bi[prow+1] - bi[prow] - nnz - 1; 486535aa4fcfSShri Abhyankar ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 486635aa4fcfSShri Abhyankar nzi += nlnk; 486735aa4fcfSShri Abhyankar prow = lnk[prow]; 486835aa4fcfSShri Abhyankar nzbd++; 486935aa4fcfSShri Abhyankar } 487035aa4fcfSShri Abhyankar bdiag[i] = nzbd; 487135aa4fcfSShri Abhyankar bi[i+1] = bi[i] + nzi; 487235aa4fcfSShri Abhyankar 487335aa4fcfSShri Abhyankar /* if free space is not available, make more free space */ 487435aa4fcfSShri Abhyankar if (current_space->local_remaining<nzi) { 487535aa4fcfSShri Abhyankar nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 487635aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 487735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 487835aa4fcfSShri Abhyankar reallocs++; 487935aa4fcfSShri Abhyankar } 488035aa4fcfSShri Abhyankar 488135aa4fcfSShri Abhyankar /* copy data into free_space and free_space_lvl, then initialize lnk */ 488235aa4fcfSShri Abhyankar ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 488335aa4fcfSShri Abhyankar bj_ptr[i] = current_space->array; 488435aa4fcfSShri Abhyankar bjlvl_ptr[i] = current_space_lvl->array; 488535aa4fcfSShri Abhyankar 488635aa4fcfSShri Abhyankar /* make sure the active row i has diagonal entry */ 488735aa4fcfSShri Abhyankar if (*(bj_ptr[i]+bdiag[i]) != i) { 488835aa4fcfSShri Abhyankar SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 488935aa4fcfSShri Abhyankar try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 489035aa4fcfSShri Abhyankar } 489135aa4fcfSShri Abhyankar 489235aa4fcfSShri Abhyankar current_space->array += nzi; 489335aa4fcfSShri Abhyankar current_space->local_used += nzi; 489435aa4fcfSShri Abhyankar current_space->local_remaining -= nzi; 489535aa4fcfSShri Abhyankar current_space_lvl->array += nzi; 489635aa4fcfSShri Abhyankar current_space_lvl->local_used += nzi; 489735aa4fcfSShri Abhyankar current_space_lvl->local_remaining -= nzi; 489835aa4fcfSShri Abhyankar } 489935aa4fcfSShri Abhyankar 490035aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 490135aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 490235aa4fcfSShri Abhyankar 490335aa4fcfSShri Abhyankar /* destroy list of free space and other temporary arrays */ 490435aa4fcfSShri Abhyankar ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 490535aa4fcfSShri Abhyankar 490635aa4fcfSShri Abhyankar /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 490735aa4fcfSShri Abhyankar ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 490835aa4fcfSShri Abhyankar 490935aa4fcfSShri Abhyankar ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 491035aa4fcfSShri Abhyankar ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 4911fca92195SBarry Smith ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 491235aa4fcfSShri Abhyankar 491335aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO) 491435aa4fcfSShri Abhyankar { 491535aa4fcfSShri Abhyankar PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 491635aa4fcfSShri Abhyankar ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 491735aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 491835aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 491935aa4fcfSShri Abhyankar ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 492035aa4fcfSShri Abhyankar if (diagonal_fill) { 492135aa4fcfSShri Abhyankar ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 492235aa4fcfSShri Abhyankar } 492335aa4fcfSShri Abhyankar } 492435aa4fcfSShri Abhyankar #endif 492535aa4fcfSShri Abhyankar 492635aa4fcfSShri Abhyankar /* put together the new matrix */ 492735aa4fcfSShri Abhyankar ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 492835aa4fcfSShri Abhyankar ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 492935aa4fcfSShri Abhyankar b = (Mat_SeqBAIJ*)(fact)->data; 493035aa4fcfSShri Abhyankar b->free_a = PETSC_TRUE; 493135aa4fcfSShri Abhyankar b->free_ij = PETSC_TRUE; 493235aa4fcfSShri Abhyankar b->singlemalloc = PETSC_FALSE; 493335aa4fcfSShri Abhyankar ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 493435aa4fcfSShri Abhyankar b->j = bj; 493535aa4fcfSShri Abhyankar b->i = bi; 493635aa4fcfSShri Abhyankar b->diag = bdiag; 493735aa4fcfSShri Abhyankar b->free_diag = PETSC_TRUE; 493835aa4fcfSShri Abhyankar b->ilen = 0; 493935aa4fcfSShri Abhyankar b->imax = 0; 494035aa4fcfSShri Abhyankar b->row = isrow; 494135aa4fcfSShri Abhyankar b->col = iscol; 494235aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 494335aa4fcfSShri Abhyankar ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 494435aa4fcfSShri Abhyankar b->icol = isicol; 494535aa4fcfSShri Abhyankar ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 494635aa4fcfSShri Abhyankar /* In b structure: Free imax, ilen, old a, old j. 494735aa4fcfSShri Abhyankar Allocate bdiag, solve_work, new a, new j */ 494835aa4fcfSShri Abhyankar ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 494935aa4fcfSShri Abhyankar b->maxnz = b->nz = bdiag[0]+1; 4950ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocs; 4951ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 4952ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 4953ae3d28f0SHong Zhang ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 495435aa4fcfSShri Abhyankar PetscFunctionReturn(0); 495535aa4fcfSShri Abhyankar } 495635aa4fcfSShri Abhyankar 495735aa4fcfSShri Abhyankar 49584e2b4712SSatish Balay /* 49594e2b4712SSatish Balay This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 49604e2b4712SSatish Balay except that the data structure of Mat_SeqAIJ is slightly different. 49614e2b4712SSatish Balay Not a good example of code reuse. 49624e2b4712SSatish Balay */ 49634a2ae208SSatish Balay #undef __FUNCT__ 49644a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 49650481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 49664e2b4712SSatish Balay { 49674e2b4712SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 49684e2b4712SSatish Balay IS isicol; 49696849ba73SBarry Smith PetscErrorCode ierr; 49705d0c19d7SBarry Smith const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 49715d0c19d7SBarry Smith PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4972a96a251dSBarry Smith PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4973d0f46423SBarry Smith PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 497441df41f0SMatthew Knepley PetscTruth col_identity,row_identity,both_identity,flg; 4975329f5518SBarry Smith PetscReal f; 4976c0c7eb62SShri Abhyankar PetscTruth newdatastruct = PETSC_FALSE; 49774e2b4712SSatish Balay 49784e2b4712SSatish Balay PetscFunctionBegin; 497916a2bf60SHong Zhang ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 498016a2bf60SHong Zhang if (newdatastruct){ 498116a2bf60SHong Zhang ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 498216a2bf60SHong Zhang PetscFunctionReturn(0); 498316a2bf60SHong Zhang } 498416a2bf60SHong Zhang 49856bce7ff8SHong Zhang ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 49866bce7ff8SHong Zhang if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 49876bce7ff8SHong Zhang 4988435faa5fSBarry Smith f = info->fill; 4989690b6cddSBarry Smith levels = (PetscInt)info->levels; 4990690b6cddSBarry Smith diagonal_fill = (PetscInt)info->diagonal_fill; 49914c49b128SBarry Smith ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 499216a2bf60SHong Zhang 4993667159a5SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4994667159a5SBarry Smith ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 49957d18ce8fSMatthew Knepley both_identity = (PetscTruth) (row_identity && col_identity); 4996309c388cSBarry Smith 499741df41f0SMatthew Knepley if (!levels && both_identity) { /* special case copy the nonzero structure */ 499816a2bf60SHong Zhang ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 49996bce7ff8SHong Zhang ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 50006bce7ff8SHong Zhang 5001719d5645SBarry Smith fact->factor = MAT_FACTOR_ILU; 5002ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5003bb3d539aSBarry Smith b->row = isrow; 5004bb3d539aSBarry Smith b->col = iscol; 5005bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5006bb3d539aSBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5007bb3d539aSBarry Smith b->icol = isicol; 5008bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5009b588c5a2SHong Zhang ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 50106bce7ff8SHong Zhang PetscFunctionReturn(0); 50116bce7ff8SHong Zhang } 50126bce7ff8SHong Zhang 50136bce7ff8SHong Zhang /* general case perform the symbolic factorization */ 50144e2b4712SSatish Balay ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 50154e2b4712SSatish Balay ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 50164e2b4712SSatish Balay 50174e2b4712SSatish Balay /* get new row pointers */ 5018690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 50194e2b4712SSatish Balay ainew[0] = 0; 50204e2b4712SSatish Balay /* don't know how many column pointers are needed so estimate */ 5021690b6cddSBarry Smith jmax = (PetscInt)(f*ai[n] + 1); 5022690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 50234e2b4712SSatish Balay /* ajfill is level of fill for each fill entry */ 5024690b6cddSBarry Smith ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 50254e2b4712SSatish Balay /* fill is a linked list of nonzeros in active row */ 5026690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 50274e2b4712SSatish Balay /* im is level for each filled value */ 5028690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 50294e2b4712SSatish Balay /* dloc is location of diagonal in factor */ 5030690b6cddSBarry Smith ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 50314e2b4712SSatish Balay dloc[0] = 0; 50324e2b4712SSatish Balay for (prow=0; prow<n; prow++) { 5033435faa5fSBarry Smith 5034435faa5fSBarry Smith /* copy prow into linked list */ 50354e2b4712SSatish Balay nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 50363b4a8b6dSBarry Smith if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 50374e2b4712SSatish Balay xi = aj + ai[r[prow]]; 50384e2b4712SSatish Balay fill[n] = n; 5039435faa5fSBarry Smith fill[prow] = -1; /* marker for diagonal entry */ 50404e2b4712SSatish Balay while (nz--) { 50414e2b4712SSatish Balay fm = n; 50424e2b4712SSatish Balay idx = ic[*xi++]; 50434e2b4712SSatish Balay do { 50444e2b4712SSatish Balay m = fm; 50454e2b4712SSatish Balay fm = fill[m]; 50464e2b4712SSatish Balay } while (fm < idx); 50474e2b4712SSatish Balay fill[m] = idx; 50484e2b4712SSatish Balay fill[idx] = fm; 50494e2b4712SSatish Balay im[idx] = 0; 50504e2b4712SSatish Balay } 5051435faa5fSBarry Smith 5052435faa5fSBarry Smith /* make sure diagonal entry is included */ 5053435faa5fSBarry Smith if (diagonal_fill && fill[prow] == -1) { 5054435faa5fSBarry Smith fm = n; 5055435faa5fSBarry Smith while (fill[fm] < prow) fm = fill[fm]; 5056435faa5fSBarry Smith fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5057435faa5fSBarry Smith fill[fm] = prow; 5058435faa5fSBarry Smith im[prow] = 0; 5059435faa5fSBarry Smith nzf++; 5060335d9088SBarry Smith dcount++; 5061435faa5fSBarry Smith } 5062435faa5fSBarry Smith 50634e2b4712SSatish Balay nzi = 0; 50644e2b4712SSatish Balay row = fill[n]; 50654e2b4712SSatish Balay while (row < prow) { 50664e2b4712SSatish Balay incrlev = im[row] + 1; 50674e2b4712SSatish Balay nz = dloc[row]; 5068435faa5fSBarry Smith xi = ajnew + ainew[row] + nz + 1; 50694e2b4712SSatish Balay flev = ajfill + ainew[row] + nz + 1; 50704e2b4712SSatish Balay nnz = ainew[row+1] - ainew[row] - nz - 1; 50714e2b4712SSatish Balay fm = row; 50724e2b4712SSatish Balay while (nnz-- > 0) { 50734e2b4712SSatish Balay idx = *xi++; 50744e2b4712SSatish Balay if (*flev + incrlev > levels) { 50754e2b4712SSatish Balay flev++; 50764e2b4712SSatish Balay continue; 50774e2b4712SSatish Balay } 50784e2b4712SSatish Balay do { 50794e2b4712SSatish Balay m = fm; 50804e2b4712SSatish Balay fm = fill[m]; 50814e2b4712SSatish Balay } while (fm < idx); 50824e2b4712SSatish Balay if (fm != idx) { 50834e2b4712SSatish Balay im[idx] = *flev + incrlev; 50844e2b4712SSatish Balay fill[m] = idx; 50854e2b4712SSatish Balay fill[idx] = fm; 50864e2b4712SSatish Balay fm = idx; 50874e2b4712SSatish Balay nzf++; 5088ecf371e4SBarry Smith } else { 50894e2b4712SSatish Balay if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 50904e2b4712SSatish Balay } 50914e2b4712SSatish Balay flev++; 50924e2b4712SSatish Balay } 50934e2b4712SSatish Balay row = fill[row]; 50944e2b4712SSatish Balay nzi++; 50954e2b4712SSatish Balay } 50964e2b4712SSatish Balay /* copy new filled row into permanent storage */ 50974e2b4712SSatish Balay ainew[prow+1] = ainew[prow] + nzf; 50984e2b4712SSatish Balay if (ainew[prow+1] > jmax) { 5099ecf371e4SBarry Smith 5100ecf371e4SBarry Smith /* estimate how much additional space we will need */ 5101ecf371e4SBarry Smith /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5102ecf371e4SBarry Smith /* just double the memory each time */ 5103690b6cddSBarry Smith PetscInt maxadd = jmax; 5104ecf371e4SBarry Smith /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 51054e2b4712SSatish Balay if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 51064e2b4712SSatish Balay jmax += maxadd; 5107ecf371e4SBarry Smith 5108ecf371e4SBarry Smith /* allocate a longer ajnew and ajfill */ 51095d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 51105d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5111606d414cSSatish Balay ierr = PetscFree(ajnew);CHKERRQ(ierr); 51125d0c19d7SBarry Smith ajnew = xitmp; 51135d0c19d7SBarry Smith ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 51145d0c19d7SBarry Smith ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5115606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 51165d0c19d7SBarry Smith ajfill = xitmp; 5117eb150c5cSKris Buschelman reallocate++; /* count how many reallocations are needed */ 51184e2b4712SSatish Balay } 51195d0c19d7SBarry Smith xitmp = ajnew + ainew[prow]; 51204e2b4712SSatish Balay flev = ajfill + ainew[prow]; 51214e2b4712SSatish Balay dloc[prow] = nzi; 51224e2b4712SSatish Balay fm = fill[n]; 51234e2b4712SSatish Balay while (nzf--) { 51245d0c19d7SBarry Smith *xitmp++ = fm; 51254e2b4712SSatish Balay *flev++ = im[fm]; 51264e2b4712SSatish Balay fm = fill[fm]; 51274e2b4712SSatish Balay } 5128435faa5fSBarry Smith /* make sure row has diagonal entry */ 5129435faa5fSBarry Smith if (ajnew[ainew[prow]+dloc[prow]] != prow) { 513077431f27SBarry Smith SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 51312401956bSBarry Smith try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5132435faa5fSBarry Smith } 51334e2b4712SSatish Balay } 5134606d414cSSatish Balay ierr = PetscFree(ajfill);CHKERRQ(ierr); 51354e2b4712SSatish Balay ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 51364e2b4712SSatish Balay ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5137606d414cSSatish Balay ierr = PetscFree(fill);CHKERRQ(ierr); 5138606d414cSSatish Balay ierr = PetscFree(im);CHKERRQ(ierr); 51394e2b4712SSatish Balay 51406cf91177SBarry Smith #if defined(PETSC_USE_INFO) 51414e2b4712SSatish Balay { 5142329f5518SBarry Smith PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5143ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5144ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5145ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5146ae15b995SBarry Smith ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5147335d9088SBarry Smith if (diagonal_fill) { 5148ae15b995SBarry Smith ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5149335d9088SBarry Smith } 51504e2b4712SSatish Balay } 515163ba0a88SBarry Smith #endif 51524e2b4712SSatish Balay 51534e2b4712SSatish Balay /* put together the new matrix */ 5154719d5645SBarry Smith ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5155719d5645SBarry Smith ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5156ae3d28f0SHong Zhang b = (Mat_SeqBAIJ*)fact->data; 5157e6b907acSBarry Smith b->free_a = PETSC_TRUE; 5158e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 51597c922b88SBarry Smith b->singlemalloc = PETSC_FALSE; 5160a96a251dSBarry Smith ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 51614e2b4712SSatish Balay b->j = ajnew; 51624e2b4712SSatish Balay b->i = ainew; 51634e2b4712SSatish Balay for (i=0; i<n; i++) dloc[i] += ainew[i]; 51644e2b4712SSatish Balay b->diag = dloc; 51657f53bb6cSHong Zhang b->free_diag = PETSC_TRUE; 51664e2b4712SSatish Balay b->ilen = 0; 51674e2b4712SSatish Balay b->imax = 0; 51684e2b4712SSatish Balay b->row = isrow; 51694e2b4712SSatish Balay b->col = iscol; 5170bcd9e38bSBarry Smith b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5171c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5172c38d4ed2SBarry Smith ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5173e51c0b9cSSatish Balay b->icol = isicol; 517487828ca2SBarry Smith ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 51754e2b4712SSatish Balay /* In b structure: Free imax, ilen, old a, old j. 51764e2b4712SSatish Balay Allocate dloc, solve_work, new a, new j */ 5177719d5645SBarry Smith ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 51784e2b4712SSatish Balay b->maxnz = b->nz = ainew[n]; 51794e2b4712SSatish Balay 5180ae3d28f0SHong Zhang fact->info.factor_mallocs = reallocate; 5181ae3d28f0SHong Zhang fact->info.fill_ratio_given = f; 5182ae3d28f0SHong Zhang fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 51836bce7ff8SHong Zhang 518441df41f0SMatthew Knepley ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 51858661488fSKris Buschelman PetscFunctionReturn(0); 51868661488fSKris Buschelman } 51878661488fSKris Buschelman 5188732ee342SKris Buschelman #undef __FUNCT__ 51897e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5190dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 51917e7071cdSKris Buschelman { 519212272027SHong Zhang /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 519312272027SHong Zhang /* int i,*AJ=a->j,nz=a->nz; */ 51945a9542e3SKris Buschelman PetscFunctionBegin; 51957cf1b8d3SKris Buschelman /* Undo Column scaling */ 51967cf1b8d3SKris Buschelman /* while (nz--) { */ 51977cf1b8d3SKris Buschelman /* AJ[i] = AJ[i]/4; */ 51987cf1b8d3SKris Buschelman /* } */ 5199c115a38dSKris Buschelman /* This should really invoke a push/pop logic, but we don't have that yet. */ 5200c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 52017cf1b8d3SKris Buschelman PetscFunctionReturn(0); 52027cf1b8d3SKris Buschelman } 52037cf1b8d3SKris Buschelman 52047cf1b8d3SKris Buschelman #undef __FUNCT__ 52057cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5206dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 52077cf1b8d3SKris Buschelman { 52087cf1b8d3SKris Buschelman Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5209b24ad042SBarry Smith PetscInt *AJ=a->j,nz=a->nz; 52102aa5897fSKris Buschelman unsigned short *aj=(unsigned short *)AJ; 52115a9542e3SKris Buschelman PetscFunctionBegin; 52120b9da03eSKris Buschelman /* Is this really necessary? */ 521320235379SKris Buschelman while (nz--) { 52140b9da03eSKris Buschelman AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 52157e7071cdSKris Buschelman } 5216c115a38dSKris Buschelman A->ops->setunfactored = PETSC_NULL; 52177e7071cdSKris Buschelman PetscFunctionReturn(0); 52187e7071cdSKris Buschelman } 52197e7071cdSKris Buschelman 5220732ee342SKris Buschelman 5221