1be1d678aSKris Buschelman #define PETSCMAT_DLL 2be1d678aSKris Buschelman 35d517e7eSBarry Smith /* 4ec3a8b7bSBarry Smith Factorization code for BAIJ format. 55d517e7eSBarry Smith */ 67c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h" 7c60f0209SBarry Smith #include "../src/mat/blockinvert.h" 8ec3a8b7bSBarry Smith 96d84be18SBarry Smith /* ------------------------------------------------------------*/ 106d84be18SBarry Smith /* 114eeb42bcSBarry Smith Version for when blocks are 2 by 2 124eeb42bcSBarry Smith */ 13b588c5a2SHong Zhang /* MatLUFactorNumeric_SeqBAIJ_2_newdatastruct - 144c000e2eSHong Zhang copied from MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() and manually re-implemented 15b588c5a2SHong Zhang Kernel_A_gets_A_times_B() 16b588c5a2SHong Zhang Kernel_A_gets_A_minus_B_times_C() 17b588c5a2SHong Zhang Kernel_A_gets_inverse_A() 18b588c5a2SHong Zhang */ 19b588c5a2SHong Zhang #undef __FUNCT__ 20b588c5a2SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct" 21b588c5a2SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 22b588c5a2SHong Zhang { 23b588c5a2SHong Zhang Mat C=B; 24b588c5a2SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 25b588c5a2SHong Zhang IS isrow = b->row,isicol = b->icol; 26b588c5a2SHong Zhang PetscErrorCode ierr; 27b588c5a2SHong Zhang const PetscInt *r,*ic,*ics; 28b588c5a2SHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 29b588c5a2SHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 30b588c5a2SHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 314c000e2eSHong Zhang PetscInt bs2 = a->bs2,flg; 32b588c5a2SHong Zhang PetscReal shift = info->shiftinblocks; 33b588c5a2SHong Zhang 34b588c5a2SHong Zhang PetscFunctionBegin; 35b588c5a2SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 36b588c5a2SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 37b588c5a2SHong Zhang 384c000e2eSHong Zhang /* generate work space needed by the factorization */ 39b588c5a2SHong Zhang ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 40b588c5a2SHong Zhang mwork = rtmp + bs2*n; 414c000e2eSHong Zhang ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 42b588c5a2SHong Zhang ics = ic; 43b588c5a2SHong Zhang 44b588c5a2SHong Zhang for (i=0; i<n; i++){ 45b588c5a2SHong Zhang /* zero rtmp */ 46b588c5a2SHong Zhang /* L part */ 47b588c5a2SHong Zhang nz = bi[i+1] - bi[i]; 48b588c5a2SHong Zhang bjtmp = bj + bi[i]; 49b588c5a2SHong Zhang for (j=0; j<nz; j++){ 50b588c5a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 51b588c5a2SHong Zhang } 52b588c5a2SHong Zhang 53b588c5a2SHong Zhang /* U part */ 54b588c5a2SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 55b588c5a2SHong Zhang bjtmp = bj + bi[2*n-i]; 56b588c5a2SHong Zhang for (j=0; j<nz; j++){ 57b588c5a2SHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 58b588c5a2SHong Zhang } 59b588c5a2SHong Zhang 60b588c5a2SHong Zhang /* load in initial (unfactored row) */ 61b588c5a2SHong Zhang nz = ai[r[i]+1] - ai[r[i]]; 62b588c5a2SHong Zhang ajtmp = aj + ai[r[i]]; 63b588c5a2SHong Zhang v = aa + bs2*ai[r[i]]; 64b588c5a2SHong Zhang for (j=0; j<nz; j++) { 65b588c5a2SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 66b588c5a2SHong Zhang } 67b588c5a2SHong Zhang 68b588c5a2SHong Zhang /* elimination */ 69b588c5a2SHong Zhang bjtmp = bj + bi[i]; 70b588c5a2SHong Zhang nzL = bi[i+1] - bi[i]; 71b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 72b1646270SShri Abhyankar row = bjtmp[k]; 73b588c5a2SHong Zhang pc = rtmp + bs2*row; 74b588c5a2SHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 75b588c5a2SHong Zhang if (flg) { 76b588c5a2SHong Zhang pv = b->a + bs2*bdiag[row]; 774c000e2eSHong Zhang /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 784c000e2eSHong Zhang ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr); 79b588c5a2SHong Zhang 80b588c5a2SHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 81b588c5a2SHong Zhang pv = b->a + bs2*bi[2*n-row]; 82b588c5a2SHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 83b588c5a2SHong Zhang for (j=0; j<nz; j++) { 84b588c5a2SHong Zhang /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 854c000e2eSHong Zhang /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 864c000e2eSHong Zhang v = rtmp + 4*pj[j]; 874c000e2eSHong Zhang ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr); 88b588c5a2SHong Zhang pv += 4; 89b588c5a2SHong Zhang } 90b588c5a2SHong Zhang ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 91b588c5a2SHong Zhang } 92b588c5a2SHong Zhang } 93b588c5a2SHong Zhang 94b588c5a2SHong Zhang /* finished row so stick it into b->a */ 95b588c5a2SHong Zhang /* L part */ 96b588c5a2SHong Zhang pv = b->a + bs2*bi[i] ; 97b588c5a2SHong Zhang pj = b->j + bi[i] ; 98b588c5a2SHong Zhang nz = bi[i+1] - bi[i]; 99b588c5a2SHong Zhang for (j=0; j<nz; j++) { 100b588c5a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 101b588c5a2SHong Zhang } 102b588c5a2SHong Zhang 103b588c5a2SHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 104b588c5a2SHong Zhang pv = b->a + bs2*bdiag[i]; 105b588c5a2SHong Zhang pj = b->j + bdiag[i]; 106b588c5a2SHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 1074c000e2eSHong Zhang /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 108b588c5a2SHong Zhang ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr); 109b588c5a2SHong Zhang 110b588c5a2SHong Zhang /* U part */ 111b588c5a2SHong Zhang pv = b->a + bs2*bi[2*n-i]; 112b588c5a2SHong Zhang pj = b->j + bi[2*n-i]; 113b588c5a2SHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 114b588c5a2SHong Zhang for (j=0; j<nz; j++){ 115b588c5a2SHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 116b588c5a2SHong Zhang } 117b588c5a2SHong Zhang } 118b588c5a2SHong Zhang 119b588c5a2SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 120b588c5a2SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 121b588c5a2SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 122b588c5a2SHong Zhang 123b588c5a2SHong Zhang C->assembled = PETSC_TRUE; 1244c000e2eSHong Zhang ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 1254c000e2eSHong Zhang PetscFunctionReturn(0); 1264c000e2eSHong Zhang } 1274c000e2eSHong Zhang 1280c4413a7SShri Abhyankar #undef __FUNCT__ 1290c4413a7SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_newdatastruct_v2" 1300c4413a7SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 1310c4413a7SShri Abhyankar { 1320c4413a7SShri Abhyankar Mat C=B; 1330c4413a7SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 1340c4413a7SShri Abhyankar IS isrow = b->row,isicol = b->icol; 1350c4413a7SShri Abhyankar PetscErrorCode ierr; 1360c4413a7SShri Abhyankar const PetscInt *r,*ic,*ics; 1370c4413a7SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 1380c4413a7SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 1390c4413a7SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 1400c4413a7SShri Abhyankar PetscInt bs2 = a->bs2,flg; 1410c4413a7SShri Abhyankar PetscReal shift = info->shiftinblocks; 1420c4413a7SShri Abhyankar 1430c4413a7SShri Abhyankar PetscFunctionBegin; 1440c4413a7SShri Abhyankar ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1450c4413a7SShri Abhyankar ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 1460c4413a7SShri Abhyankar 1470c4413a7SShri Abhyankar /* generate work space needed by the factorization */ 1480c4413a7SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1490c4413a7SShri Abhyankar mwork = rtmp + bs2*n; 1500c4413a7SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 1510c4413a7SShri Abhyankar ics = ic; 1520c4413a7SShri Abhyankar 1530c4413a7SShri Abhyankar for (i=0; i<n; i++){ 1540c4413a7SShri Abhyankar /* zero rtmp */ 1550c4413a7SShri Abhyankar /* L part */ 1560c4413a7SShri Abhyankar nz = bi[i+1] - bi[i]; 1570c4413a7SShri Abhyankar bjtmp = bj + bi[i]; 1580c4413a7SShri Abhyankar for (j=0; j<nz; j++){ 1590c4413a7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 1600c4413a7SShri Abhyankar } 1610c4413a7SShri Abhyankar 1620c4413a7SShri Abhyankar /* U part */ 1630c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 1640c4413a7SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 1650c4413a7SShri Abhyankar for (j=0; j<nz; j++){ 1660c4413a7SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 1670c4413a7SShri Abhyankar } 1680c4413a7SShri Abhyankar 1690c4413a7SShri Abhyankar /* load in initial (unfactored row) */ 1700c4413a7SShri Abhyankar nz = ai[r[i]+1] - ai[r[i]]; 1710c4413a7SShri Abhyankar ajtmp = aj + ai[r[i]]; 1720c4413a7SShri Abhyankar v = aa + bs2*ai[r[i]]; 1730c4413a7SShri Abhyankar for (j=0; j<nz; j++) { 1740c4413a7SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 1750c4413a7SShri Abhyankar } 1760c4413a7SShri Abhyankar 1770c4413a7SShri Abhyankar /* elimination */ 1780c4413a7SShri Abhyankar bjtmp = bj + bi[i]; 1790c4413a7SShri Abhyankar nzL = bi[i+1] - bi[i]; 1800c4413a7SShri Abhyankar for(k=0;k < nzL;k++) { 1810c4413a7SShri Abhyankar row = bjtmp[k]; 1820c4413a7SShri Abhyankar pc = rtmp + bs2*row; 1830c4413a7SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 1840c4413a7SShri Abhyankar if (flg) { 1850c4413a7SShri Abhyankar pv = b->a + bs2*bdiag[row]; 1860c4413a7SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 1870c4413a7SShri Abhyankar ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr); 1880c4413a7SShri Abhyankar 1890c4413a7SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 1900c4413a7SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 1910c4413a7SShri Abhyankar nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 1920c4413a7SShri Abhyankar for (j=0; j<nz; j++) { 1930c4413a7SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 1940c4413a7SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 1950c4413a7SShri Abhyankar v = rtmp + 4*pj[j]; 1960c4413a7SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr); 1970c4413a7SShri Abhyankar pv += 4; 1980c4413a7SShri Abhyankar } 1990c4413a7SShri Abhyankar ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 2000c4413a7SShri Abhyankar } 2010c4413a7SShri Abhyankar } 2020c4413a7SShri Abhyankar 2030c4413a7SShri Abhyankar /* finished row so stick it into b->a */ 2040c4413a7SShri Abhyankar /* L part */ 2050c4413a7SShri Abhyankar pv = b->a + bs2*bi[i] ; 2060c4413a7SShri Abhyankar pj = b->j + bi[i] ; 2070c4413a7SShri Abhyankar nz = bi[i+1] - bi[i]; 2080c4413a7SShri Abhyankar for (j=0; j<nz; j++) { 2090c4413a7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 2100c4413a7SShri Abhyankar } 2110c4413a7SShri Abhyankar 2120c4413a7SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 2130c4413a7SShri Abhyankar pv = b->a + bs2*bdiag[i]; 2140c4413a7SShri Abhyankar pj = b->j + bdiag[i]; 2150c4413a7SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 2160c4413a7SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 2170c4413a7SShri Abhyankar ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr); 2180c4413a7SShri Abhyankar 2190c4413a7SShri Abhyankar /* U part */ 2200c4413a7SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 2210c4413a7SShri Abhyankar pj = b->j + bdiag[i+1]+1; 2220c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 2230c4413a7SShri Abhyankar for (j=0; j<nz; j++){ 2240c4413a7SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 2250c4413a7SShri Abhyankar } 2260c4413a7SShri Abhyankar } 2270c4413a7SShri Abhyankar 2280c4413a7SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 2290c4413a7SShri Abhyankar ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2300c4413a7SShri Abhyankar ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2310c4413a7SShri Abhyankar 2320c4413a7SShri Abhyankar C->assembled = PETSC_TRUE; 2330c4413a7SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 2340c4413a7SShri Abhyankar PetscFunctionReturn(0); 2350c4413a7SShri Abhyankar } 2360c4413a7SShri Abhyankar 2374c000e2eSHong Zhang /* 2384c000e2eSHong Zhang MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct - 2394c000e2eSHong Zhang copied from MatLUFactorNumeric_SeqBAIJ_2_newdatastruct(), then remove ordering 2404c000e2eSHong Zhang */ 2414c000e2eSHong Zhang #undef __FUNCT__ 2424c000e2eSHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct" 2434c000e2eSHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 2444c000e2eSHong Zhang { 2454c000e2eSHong Zhang Mat C=B; 2464c000e2eSHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 2474c000e2eSHong Zhang PetscErrorCode ierr; 2484c000e2eSHong Zhang PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 2494c000e2eSHong Zhang PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 2504c000e2eSHong Zhang MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 2514c000e2eSHong Zhang PetscInt bs2 = a->bs2,flg; 2524c000e2eSHong Zhang PetscReal shift = info->shiftinblocks; 2534c000e2eSHong Zhang 2544c000e2eSHong Zhang PetscFunctionBegin; 2554c000e2eSHong Zhang /* generate work space needed by the factorization */ 2564c000e2eSHong Zhang ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 2574c000e2eSHong Zhang mwork = rtmp + bs2*n; 2584c000e2eSHong Zhang ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 2594c000e2eSHong Zhang 2604c000e2eSHong Zhang for (i=0; i<n; i++){ 2614c000e2eSHong Zhang /* zero rtmp */ 2624c000e2eSHong Zhang /* L part */ 2634c000e2eSHong Zhang nz = bi[i+1] - bi[i]; 2644c000e2eSHong Zhang bjtmp = bj + bi[i]; 2654c000e2eSHong Zhang for (j=0; j<nz; j++){ 2664c000e2eSHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 2674c000e2eSHong Zhang } 2684c000e2eSHong Zhang 2694c000e2eSHong Zhang /* U part */ 2704c000e2eSHong Zhang nz = bi[2*n-i+1] - bi[2*n-i]; 2714c000e2eSHong Zhang bjtmp = bj + bi[2*n-i]; 2724c000e2eSHong Zhang for (j=0; j<nz; j++){ 2734c000e2eSHong Zhang ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 2744c000e2eSHong Zhang } 2754c000e2eSHong Zhang 2764c000e2eSHong Zhang /* load in initial (unfactored row) */ 2774c000e2eSHong Zhang nz = ai[i+1] - ai[i]; 2784c000e2eSHong Zhang ajtmp = aj + ai[i]; 2794c000e2eSHong Zhang v = aa + bs2*ai[i]; 2804c000e2eSHong Zhang for (j=0; j<nz; j++) { 2814c000e2eSHong Zhang ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 2824c000e2eSHong Zhang } 2834c000e2eSHong Zhang 2844c000e2eSHong Zhang /* elimination */ 2854c000e2eSHong Zhang bjtmp = bj + bi[i]; 2864c000e2eSHong Zhang nzL = bi[i+1] - bi[i]; 287b1646270SShri Abhyankar for(k=0;k < nzL;k++) { 288b1646270SShri Abhyankar row = bjtmp[k]; 2894c000e2eSHong Zhang pc = rtmp + bs2*row; 2904c000e2eSHong Zhang for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 2914c000e2eSHong Zhang if (flg) { 2924c000e2eSHong Zhang pv = b->a + bs2*bdiag[row]; 2934c000e2eSHong Zhang /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 2944c000e2eSHong Zhang ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr); 2954c000e2eSHong Zhang 2964c000e2eSHong Zhang pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 2974c000e2eSHong Zhang pv = b->a + bs2*bi[2*n-row]; 2984c000e2eSHong Zhang nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 2994c000e2eSHong Zhang for (j=0; j<nz; j++) { 3004c000e2eSHong Zhang /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 3014c000e2eSHong Zhang /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 3024c000e2eSHong Zhang v = rtmp + 4*pj[j]; 3034c000e2eSHong Zhang ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr); 3044c000e2eSHong Zhang pv += 4; 3054c000e2eSHong Zhang } 3064c000e2eSHong Zhang ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 3074c000e2eSHong Zhang } 3084c000e2eSHong Zhang } 3094c000e2eSHong Zhang 3104c000e2eSHong Zhang /* finished row so stick it into b->a */ 3114c000e2eSHong Zhang /* L part */ 3124c000e2eSHong Zhang pv = b->a + bs2*bi[i] ; 3134c000e2eSHong Zhang pj = b->j + bi[i] ; 3144c000e2eSHong Zhang nz = bi[i+1] - bi[i]; 3154c000e2eSHong Zhang for (j=0; j<nz; j++) { 3164c000e2eSHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3174c000e2eSHong Zhang } 3184c000e2eSHong Zhang 3194c000e2eSHong Zhang /* Mark diagonal and invert diagonal for simplier triangular solves */ 3204c000e2eSHong Zhang pv = b->a + bs2*bdiag[i]; 3214c000e2eSHong Zhang pj = b->j + bdiag[i]; 3224c000e2eSHong Zhang ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3234c000e2eSHong Zhang /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 3244c000e2eSHong Zhang ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr); 3254c000e2eSHong Zhang 3264c000e2eSHong Zhang /* U part */ 3274c000e2eSHong Zhang pv = b->a + bs2*bi[2*n-i]; 3284c000e2eSHong Zhang pj = b->j + bi[2*n-i]; 3294c000e2eSHong Zhang nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3304c000e2eSHong Zhang for (j=0; j<nz; j++){ 3314c000e2eSHong Zhang ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3324c000e2eSHong Zhang } 3334c000e2eSHong Zhang } 3344c000e2eSHong Zhang 3354c000e2eSHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 3364c000e2eSHong Zhang C->assembled = PETSC_TRUE; 3374c000e2eSHong Zhang ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 338b588c5a2SHong Zhang PetscFunctionReturn(0); 339b588c5a2SHong Zhang } 340b588c5a2SHong Zhang 3414a2ae208SSatish Balay #undef __FUNCT__ 342b2b2dd24SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2" 343b2b2dd24SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info) 344b2b2dd24SShri Abhyankar { 345b2b2dd24SShri Abhyankar Mat C=B; 346b2b2dd24SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 347b2b2dd24SShri Abhyankar PetscErrorCode ierr; 348b2b2dd24SShri Abhyankar PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 349b2b2dd24SShri Abhyankar PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 350b2b2dd24SShri Abhyankar MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 351b2b2dd24SShri Abhyankar PetscInt bs2 = a->bs2,flg; 352b2b2dd24SShri Abhyankar PetscReal shift = info->shiftinblocks; 353b2b2dd24SShri Abhyankar 354b2b2dd24SShri Abhyankar PetscFunctionBegin; 355b2b2dd24SShri Abhyankar /* generate work space needed by the factorization */ 356b2b2dd24SShri Abhyankar ierr = PetscMalloc((bs2*n+bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 357b2b2dd24SShri Abhyankar mwork = rtmp + bs2*n; 358b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 359b2b2dd24SShri Abhyankar 360b2b2dd24SShri Abhyankar for (i=0; i<n; i++){ 361b2b2dd24SShri Abhyankar /* zero rtmp */ 362b2b2dd24SShri Abhyankar /* L part */ 363b2b2dd24SShri Abhyankar nz = bi[i+1] - bi[i]; 364b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 365b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 366b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 367b2b2dd24SShri Abhyankar } 368b2b2dd24SShri Abhyankar 369b2b2dd24SShri Abhyankar /* U part */ 370b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1]; 371b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i+1]+1; 372b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 373b2b2dd24SShri Abhyankar ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 374b2b2dd24SShri Abhyankar } 375b2b2dd24SShri Abhyankar 376b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 377b2b2dd24SShri Abhyankar nz = ai[i+1] - ai[i]; 378b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 379b2b2dd24SShri Abhyankar v = aa + bs2*ai[i]; 380b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 381b2b2dd24SShri Abhyankar ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 382b2b2dd24SShri Abhyankar } 383b2b2dd24SShri Abhyankar 384b2b2dd24SShri Abhyankar /* elimination */ 385b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 386b2b2dd24SShri Abhyankar nzL = bi[i+1] - bi[i]; 387b2b2dd24SShri Abhyankar for(k=0;k < nzL;k++) { 388b2b2dd24SShri Abhyankar row = bjtmp[k]; 389b2b2dd24SShri Abhyankar pc = rtmp + bs2*row; 390b2b2dd24SShri Abhyankar for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 391b2b2dd24SShri Abhyankar if (flg) { 392b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[row]; 393b2b2dd24SShri Abhyankar /* Kernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 394b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_A_times_B_2(pc,pv,mwork);CHKERRQ(ierr); 395b2b2dd24SShri Abhyankar 396b2b2dd24SShri Abhyankar pj = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */ 397b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[row+1]+1); 398b2b2dd24SShri Abhyankar nz = bdiag[row]-bdiag[row+1] - 1; /* num of entries in U(row,:) excluding diag */ 399b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 400b2b2dd24SShri Abhyankar /* Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 401b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 402b2b2dd24SShri Abhyankar v = rtmp + 4*pj[j]; 403b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_A_minus_B_times_C_2(v,pc,pv);CHKERRQ(ierr); 404b2b2dd24SShri Abhyankar pv += 4; 405b2b2dd24SShri Abhyankar } 406b2b2dd24SShri Abhyankar ierr = PetscLogFlops(16*nz+12);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 407b2b2dd24SShri Abhyankar } 408b2b2dd24SShri Abhyankar } 409b2b2dd24SShri Abhyankar 410b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 411b2b2dd24SShri Abhyankar /* L part */ 412b2b2dd24SShri Abhyankar pv = b->a + bs2*bi[i] ; 413b2b2dd24SShri Abhyankar pj = b->j + bi[i] ; 414b2b2dd24SShri Abhyankar nz = bi[i+1] - bi[i]; 415b2b2dd24SShri Abhyankar for (j=0; j<nz; j++) { 416b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 417b2b2dd24SShri Abhyankar } 418b2b2dd24SShri Abhyankar 419b2b2dd24SShri Abhyankar /* Mark diagonal and invert diagonal for simplier triangular solves */ 420b2b2dd24SShri Abhyankar pv = b->a + bs2*bdiag[i]; 421b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 422b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 423b2b2dd24SShri Abhyankar /* ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); */ 424b2b2dd24SShri Abhyankar ierr = Kernel_A_gets_inverse_A_2(pv,shift);CHKERRQ(ierr); 425b2b2dd24SShri Abhyankar 426b2b2dd24SShri Abhyankar /* U part */ 427b2b2dd24SShri Abhyankar /* 428b2b2dd24SShri Abhyankar pv = b->a + bs2*bi[2*n-i]; 429b2b2dd24SShri Abhyankar pj = b->j + bi[2*n-i]; 430b2b2dd24SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i] - 1; 431b2b2dd24SShri Abhyankar */ 432b2b2dd24SShri Abhyankar pv = b->a + bs2*(bdiag[i+1]+1); 433b2b2dd24SShri Abhyankar pj = b->j + bdiag[i+1]+1; 434b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i+1] - 1; 435b2b2dd24SShri Abhyankar for (j=0; j<nz; j++){ 436b2b2dd24SShri Abhyankar ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 437b2b2dd24SShri Abhyankar } 438b2b2dd24SShri Abhyankar } 439b2b2dd24SShri Abhyankar 440b2b2dd24SShri Abhyankar ierr = PetscFree(rtmp);CHKERRQ(ierr); 441b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 442b2b2dd24SShri Abhyankar ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 443b2b2dd24SShri Abhyankar PetscFunctionReturn(0); 444b2b2dd24SShri Abhyankar } 445b2b2dd24SShri Abhyankar 446b2b2dd24SShri Abhyankar #undef __FUNCT__ 4474a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2" 4480481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B,Mat A,const MatFactorInfo *info) 4494eeb42bcSBarry Smith { 450719d5645SBarry Smith Mat C = B; 4514eeb42bcSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 4527cdf2dbbSSatish Balay IS isrow = b->row,isicol = b->icol; 4536849ba73SBarry Smith PetscErrorCode ierr; 4545d0c19d7SBarry Smith const PetscInt *r,*ic; 4555d0c19d7SBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 456690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row; 457690b6cddSBarry Smith PetscInt *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj; 458329f5518SBarry Smith MatScalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4; 4592515f8d2SSatish Balay MatScalar p1,p2,p3,p4; 4603f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 46162bba022SBarry Smith PetscReal shift = info->shiftinblocks; 4624eeb42bcSBarry Smith 4633a40ed3dSBarry Smith PetscFunctionBegin; 4644eeb42bcSBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4654eeb42bcSBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 466b0a32e0cSBarry Smith ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4674eeb42bcSBarry Smith 4684eeb42bcSBarry Smith for (i=0; i<n; i++) { 4694078e994SBarry Smith nz = bi[i+1] - bi[i]; 4704078e994SBarry Smith ajtmp = bj + bi[i]; 4714eeb42bcSBarry Smith for (j=0; j<nz; j++) { 4724eeb42bcSBarry Smith x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 4734eeb42bcSBarry Smith } 4744eeb42bcSBarry Smith /* load in initial (unfactored row) */ 4754eeb42bcSBarry Smith idx = r[i]; 4764078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 4774078e994SBarry Smith ajtmpold = aj + ai[idx]; 4784078e994SBarry Smith v = aa + 4*ai[idx]; 4794eeb42bcSBarry Smith for (j=0; j<nz; j++) { 4804eeb42bcSBarry Smith x = rtmp+4*ic[ajtmpold[j]]; 4814eeb42bcSBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 4824eeb42bcSBarry Smith v += 4; 4834eeb42bcSBarry Smith } 4844eeb42bcSBarry Smith row = *ajtmp++; 4854eeb42bcSBarry Smith while (row < i) { 4864eeb42bcSBarry Smith pc = rtmp + 4*row; 4874eeb42bcSBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 48888685aaeSLois Curfman McInnes if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 4894078e994SBarry Smith pv = ba + 4*diag_offset[row]; 4904078e994SBarry Smith pj = bj + diag_offset[row] + 1; 4914eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 4924eeb42bcSBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 4934eeb42bcSBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 4944eeb42bcSBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 4954eeb42bcSBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 4964078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 4974eeb42bcSBarry Smith pv += 4; 4984eeb42bcSBarry Smith for (j=0; j<nz; j++) { 4994eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 5004eeb42bcSBarry Smith x = rtmp + 4*pj[j]; 5014eeb42bcSBarry Smith x[0] -= m1*x1 + m3*x2; 5024eeb42bcSBarry Smith x[1] -= m2*x1 + m4*x2; 5034eeb42bcSBarry Smith x[2] -= m1*x3 + m3*x4; 5044eeb42bcSBarry Smith x[3] -= m2*x3 + m4*x4; 5054eeb42bcSBarry Smith pv += 4; 5064eeb42bcSBarry Smith } 507dc0b31edSSatish Balay ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr); 5084eeb42bcSBarry Smith } 5094eeb42bcSBarry Smith row = *ajtmp++; 5104eeb42bcSBarry Smith } 5114eeb42bcSBarry Smith /* finished row so stick it into b->a */ 5124078e994SBarry Smith pv = ba + 4*bi[i]; 5134078e994SBarry Smith pj = bj + bi[i]; 5144078e994SBarry Smith nz = bi[i+1] - bi[i]; 5154eeb42bcSBarry Smith for (j=0; j<nz; j++) { 5164eeb42bcSBarry Smith x = rtmp+4*pj[j]; 5174eeb42bcSBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 5184eeb42bcSBarry Smith pv += 4; 5194eeb42bcSBarry Smith } 5204eeb42bcSBarry Smith /* invert diagonal block */ 5214078e994SBarry Smith w = ba + 4*diag_offset[i]; 52262bba022SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr); 5234eeb42bcSBarry Smith } 5244eeb42bcSBarry Smith 525606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 5264eeb42bcSBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5274eeb42bcSBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 528db4efbfdSBarry Smith C->ops->solve = MatSolve_SeqBAIJ_2; 529db4efbfdSBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 5304eeb42bcSBarry Smith C->assembled = PETSC_TRUE; 531efee365bSSatish Balay ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5323a40ed3dSBarry Smith PetscFunctionReturn(0); 5334eeb42bcSBarry Smith } 5344cd38560SBarry Smith /* 5354cd38560SBarry Smith Version for when blocks are 2 by 2 Using natural ordering 5364cd38560SBarry Smith */ 5374a2ae208SSatish Balay #undef __FUNCT__ 5384a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering" 5390481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 5404cd38560SBarry Smith { 5414cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 542dfbe8321SBarry Smith PetscErrorCode ierr; 543690b6cddSBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 544690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row; 545690b6cddSBarry Smith PetscInt *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 546329f5518SBarry Smith MatScalar *pv,*v,*rtmp,*pc,*w,*x; 5472515f8d2SSatish Balay MatScalar p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4; 5484cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 54962bba022SBarry Smith PetscReal shift = info->shiftinblocks; 5504cd38560SBarry Smith 5514cd38560SBarry Smith PetscFunctionBegin; 552b0a32e0cSBarry Smith ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5534cd38560SBarry Smith for (i=0; i<n; i++) { 5544cd38560SBarry Smith nz = bi[i+1] - bi[i]; 5554cd38560SBarry Smith ajtmp = bj + bi[i]; 5564cd38560SBarry Smith for (j=0; j<nz; j++) { 5574cd38560SBarry Smith x = rtmp+4*ajtmp[j]; 5584cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = 0.0; 5594cd38560SBarry Smith } 5604cd38560SBarry Smith /* load in initial (unfactored row) */ 5614cd38560SBarry Smith nz = ai[i+1] - ai[i]; 5624cd38560SBarry Smith ajtmpold = aj + ai[i]; 5634cd38560SBarry Smith v = aa + 4*ai[i]; 5644cd38560SBarry Smith for (j=0; j<nz; j++) { 5654cd38560SBarry Smith x = rtmp+4*ajtmpold[j]; 5664cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 5674cd38560SBarry Smith v += 4; 5684cd38560SBarry Smith } 5694cd38560SBarry Smith row = *ajtmp++; 5704cd38560SBarry Smith while (row < i) { 5714cd38560SBarry Smith pc = rtmp + 4*row; 5724cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 5734cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 5744cd38560SBarry Smith pv = ba + 4*diag_offset[row]; 5754cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 5764cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 5774cd38560SBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 5784cd38560SBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 5794cd38560SBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 5804cd38560SBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 5814cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 5824cd38560SBarry Smith pv += 4; 5834cd38560SBarry Smith for (j=0; j<nz; j++) { 5844cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 5854cd38560SBarry Smith x = rtmp + 4*pj[j]; 5864cd38560SBarry Smith x[0] -= m1*x1 + m3*x2; 5874cd38560SBarry Smith x[1] -= m2*x1 + m4*x2; 5884cd38560SBarry Smith x[2] -= m1*x3 + m3*x4; 5894cd38560SBarry Smith x[3] -= m2*x3 + m4*x4; 5904cd38560SBarry Smith pv += 4; 5914cd38560SBarry Smith } 592dc0b31edSSatish Balay ierr = PetscLogFlops(16.0*nz+12.0);CHKERRQ(ierr); 5934cd38560SBarry Smith } 5944cd38560SBarry Smith row = *ajtmp++; 5954cd38560SBarry Smith } 5964cd38560SBarry Smith /* finished row so stick it into b->a */ 5974cd38560SBarry Smith pv = ba + 4*bi[i]; 5984cd38560SBarry Smith pj = bj + bi[i]; 5994cd38560SBarry Smith nz = bi[i+1] - bi[i]; 6004cd38560SBarry Smith for (j=0; j<nz; j++) { 6014cd38560SBarry Smith x = rtmp+4*pj[j]; 6024cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 6032efa7f71SHong Zhang /* 6042efa7f71SHong Zhang printf(" col %d:",pj[j]); 6052efa7f71SHong Zhang PetscInt j1; 6062efa7f71SHong Zhang for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 6072efa7f71SHong Zhang printf("\n"); 6082efa7f71SHong Zhang */ 6094cd38560SBarry Smith pv += 4; 6104cd38560SBarry Smith } 6114cd38560SBarry Smith /* invert diagonal block */ 6124cd38560SBarry Smith w = ba + 4*diag_offset[i]; 6132efa7f71SHong Zhang /* 6142efa7f71SHong Zhang printf(" \n%d -th: diag: ",i); 6152efa7f71SHong Zhang for (j=0; j<4; j++){ 6162efa7f71SHong Zhang printf(" %g,",w[j]); 6172efa7f71SHong Zhang } 6182efa7f71SHong Zhang printf("\n----------------------------\n"); 6192efa7f71SHong Zhang */ 62062bba022SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w,shift);CHKERRQ(ierr); 6214cd38560SBarry Smith } 6224cd38560SBarry Smith 623606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 624db4efbfdSBarry Smith C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 625db4efbfdSBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 6264cd38560SBarry Smith C->assembled = PETSC_TRUE; 627efee365bSSatish Balay ierr = PetscLogFlops(1.3333*8*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 6284cd38560SBarry Smith PetscFunctionReturn(0); 6294cd38560SBarry Smith } 6307fc0212eSBarry Smith 6317fc0212eSBarry Smith /* ----------------------------------------------------------- */ 6327fc0212eSBarry Smith /* 6337fc0212eSBarry Smith Version for when blocks are 1 by 1. 6347fc0212eSBarry Smith */ 6354a2ae208SSatish Balay #undef __FUNCT__ 6364a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1" 6370481f469SBarry Smith PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat C,Mat A,const MatFactorInfo *info) 6387fc0212eSBarry Smith { 6397fc0212eSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 6407cdf2dbbSSatish Balay IS isrow = b->row,isicol = b->icol; 6416849ba73SBarry Smith PetscErrorCode ierr; 6425d0c19d7SBarry Smith const PetscInt *r,*ic; 6435d0c19d7SBarry Smith PetscInt i,j,n = a->mbs,*bi = b->i,*bj = b->j; 644690b6cddSBarry Smith PetscInt *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j; 645690b6cddSBarry Smith PetscInt *diag_offset = b->diag,diag,*pj; 646329f5518SBarry Smith MatScalar *pv,*v,*rtmp,multiplier,*pc; 6473f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 648f58c8c74SBarry Smith PetscTruth row_identity, col_identity; 6497fc0212eSBarry Smith 6503a40ed3dSBarry Smith PetscFunctionBegin; 6517fc0212eSBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 6527fc0212eSBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 653b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 6547fc0212eSBarry Smith 6557fc0212eSBarry Smith for (i=0; i<n; i++) { 6564078e994SBarry Smith nz = bi[i+1] - bi[i]; 6574078e994SBarry Smith ajtmp = bj + bi[i]; 6587fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0; 6597fc0212eSBarry Smith 6607fc0212eSBarry Smith /* load in initial (unfactored row) */ 6614078e994SBarry Smith nz = ai[r[i]+1] - ai[r[i]]; 6624078e994SBarry Smith ajtmpold = aj + ai[r[i]]; 6634078e994SBarry Smith v = aa + ai[r[i]]; 6647fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 6657fc0212eSBarry Smith 6667fc0212eSBarry Smith row = *ajtmp++; 6677fc0212eSBarry Smith while (row < i) { 6687fc0212eSBarry Smith pc = rtmp + row; 6697fc0212eSBarry Smith if (*pc != 0.0) { 6704078e994SBarry Smith pv = ba + diag_offset[row]; 6714078e994SBarry Smith pj = bj + diag_offset[row] + 1; 6727fc0212eSBarry Smith multiplier = *pc * *pv++; 6737fc0212eSBarry Smith *pc = multiplier; 6744078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 6757fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 676dc0b31edSSatish Balay ierr = PetscLogFlops(1.0+2.0*nz);CHKERRQ(ierr); 6777fc0212eSBarry Smith } 6787fc0212eSBarry Smith row = *ajtmp++; 6797fc0212eSBarry Smith } 6807fc0212eSBarry Smith /* finished row so stick it into b->a */ 6814078e994SBarry Smith pv = ba + bi[i]; 6824078e994SBarry Smith pj = bj + bi[i]; 6834078e994SBarry Smith nz = bi[i+1] - bi[i]; 6847fc0212eSBarry Smith for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];} 6854078e994SBarry Smith diag = diag_offset[i] - bi[i]; 6867fc0212eSBarry Smith /* check pivot entry for current row */ 687a73cf429SBarry Smith if (pv[diag] == 0.0) { 6883b4a8b6dSBarry Smith SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot: row in original ordering %D in permuted ordering %D",r[i],i); 6897fc0212eSBarry Smith } 6907fc0212eSBarry Smith pv[diag] = 1.0/pv[diag]; 6917fc0212eSBarry Smith } 6927fc0212eSBarry Smith 693606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 6947fc0212eSBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6957fc0212eSBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 696f58c8c74SBarry Smith ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 697f58c8c74SBarry Smith ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 698f58c8c74SBarry Smith if (row_identity && col_identity) { 699f58c8c74SBarry Smith C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 700f58c8c74SBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 701f58c8c74SBarry Smith } else { 702db4efbfdSBarry Smith C->ops->solve = MatSolve_SeqBAIJ_1; 703db4efbfdSBarry Smith C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 704f58c8c74SBarry Smith } 7057fc0212eSBarry Smith C->assembled = PETSC_TRUE; 706d0f46423SBarry Smith ierr = PetscLogFlops(C->cmap->n);CHKERRQ(ierr); 7073a40ed3dSBarry Smith PetscFunctionReturn(0); 7087fc0212eSBarry Smith } 7097fc0212eSBarry Smith 710e631078cSBarry Smith EXTERN_C_BEGIN 711b24902e0SBarry Smith #undef __FUNCT__ 712b24902e0SBarry Smith #define __FUNCT__ "MatGetFactor_seqbaij_petsc" 713b24902e0SBarry Smith PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A,MatFactorType ftype,Mat *B) 714b24902e0SBarry Smith { 715d0f46423SBarry Smith PetscInt n = A->rmap->n; 716b24902e0SBarry Smith PetscErrorCode ierr; 717b24902e0SBarry Smith 718b24902e0SBarry Smith PetscFunctionBegin; 719b24902e0SBarry Smith ierr = MatCreate(((PetscObject)A)->comm,B);CHKERRQ(ierr); 720b24902e0SBarry Smith ierr = MatSetSizes(*B,n,n,n,n);CHKERRQ(ierr); 721c8342467SHong Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 722b24902e0SBarry Smith ierr = MatSetType(*B,MATSEQBAIJ);CHKERRQ(ierr); 723db4efbfdSBarry Smith (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 724db4efbfdSBarry Smith (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 725c8342467SHong Zhang (*B)->ops->iludtfactor = MatILUDTFactor_SeqBAIJ; 726b24902e0SBarry Smith } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 727b24902e0SBarry Smith ierr = MatSetType(*B,MATSEQSBAIJ);CHKERRQ(ierr); 7285c9eb25fSBarry Smith ierr = MatSeqSBAIJSetPreallocation(*B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 7295c9eb25fSBarry Smith (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 7305c9eb25fSBarry Smith (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 731b24902e0SBarry Smith } else SETERRQ(PETSC_ERR_SUP,"Factor type not supported"); 732719d5645SBarry Smith (*B)->factor = ftype; 733b24902e0SBarry Smith PetscFunctionReturn(0); 734b24902e0SBarry Smith } 735e631078cSBarry Smith EXTERN_C_END 736273d9f13SBarry Smith 737db4efbfdSBarry Smith EXTERN_C_BEGIN 738db4efbfdSBarry Smith #undef __FUNCT__ 739db4efbfdSBarry Smith #define __FUNCT__ "MatGetFactorAvailable_seqaij_petsc" 740db4efbfdSBarry Smith PetscErrorCode MatGetFactorAvailable_seqbaij_petsc(Mat A,MatFactorType ftype,PetscTruth *flg) 741db4efbfdSBarry Smith { 742db4efbfdSBarry Smith PetscFunctionBegin; 743db4efbfdSBarry Smith *flg = PETSC_TRUE; 744db4efbfdSBarry Smith PetscFunctionReturn(0); 745db4efbfdSBarry Smith } 746db4efbfdSBarry Smith EXTERN_C_END 747db4efbfdSBarry Smith 7485d517e7eSBarry Smith /* ----------------------------------------------------------- */ 7494a2ae208SSatish Balay #undef __FUNCT__ 7504a2ae208SSatish Balay #define __FUNCT__ "MatLUFactor_SeqBAIJ" 7510481f469SBarry Smith PetscErrorCode MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,const MatFactorInfo *info) 7525d517e7eSBarry Smith { 753dfbe8321SBarry Smith PetscErrorCode ierr; 7545d517e7eSBarry Smith Mat C; 7555d517e7eSBarry Smith 7563a40ed3dSBarry Smith PetscFunctionBegin; 75743244d56SBarry Smith ierr = MatGetFactor(A,MAT_SOLVER_PETSC,MAT_FACTOR_LU,&C);CHKERRQ(ierr); 758719d5645SBarry Smith ierr = MatLUFactorSymbolic(C,A,row,col,info);CHKERRQ(ierr); 759719d5645SBarry Smith ierr = MatLUFactorNumeric(C,A,info);CHKERRQ(ierr); 760db4efbfdSBarry Smith A->ops->solve = C->ops->solve; 761db4efbfdSBarry Smith A->ops->solvetranspose = C->ops->solvetranspose; 762273d9f13SBarry Smith ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 76352e6d16bSBarry Smith ierr = PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol);CHKERRQ(ierr); 7643a40ed3dSBarry Smith PetscFunctionReturn(0); 7655d517e7eSBarry Smith } 7664cd38560SBarry Smith 7677c4f633dSBarry Smith #include "../src/mat/impls/sbaij/seq/sbaij.h" 768c05c3958SHong Zhang #undef __FUNCT__ 769c05c3958SHong Zhang #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N" 7700481f469SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C,Mat A,const MatFactorInfo *info) 771c05c3958SHong Zhang { 772f3086b4bSHong Zhang PetscErrorCode ierr; 77378910aadSHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 77478910aadSHong Zhang Mat_SeqSBAIJ *b=(Mat_SeqSBAIJ*)C->data; 77578910aadSHong Zhang IS ip=b->row; 7765d0c19d7SBarry Smith const PetscInt *rip; 7775d0c19d7SBarry Smith PetscInt i,j,mbs=a->mbs,bs=A->rmap->bs,*bi=b->i,*bj=b->j,*bcol; 77878910aadSHong Zhang PetscInt *ai=a->i,*aj=a->j; 77978910aadSHong Zhang PetscInt k,jmin,jmax,*jl,*il,col,nexti,ili,nz; 78078910aadSHong Zhang MatScalar *rtmp,*ba=b->a,*bval,*aa=a->a,dk,uikdi; 7813cea4cbeSHong Zhang PetscReal zeropivot,rs,shiftnz; 782fbf22428SSatish Balay PetscReal shiftpd; 7833cea4cbeSHong Zhang ChShift_Ctx sctx; 7843cea4cbeSHong Zhang PetscInt newshift; 78578910aadSHong Zhang 786c05c3958SHong Zhang PetscFunctionBegin; 7876ad2eaddSHong Zhang if (bs > 1) { 7886ad2eaddSHong Zhang if (!a->sbaijMat){ 789ceb03754SKris Buschelman ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr); 7906ad2eaddSHong Zhang } 791719d5645SBarry Smith ierr = (a->sbaijMat)->ops->choleskyfactornumeric(C,a->sbaijMat,info);CHKERRQ(ierr); 7926ad2eaddSHong Zhang ierr = MatDestroy(a->sbaijMat);CHKERRQ(ierr); 7936ad2eaddSHong Zhang a->sbaijMat = PETSC_NULL; 7946ad2eaddSHong Zhang PetscFunctionReturn(0); 7956ad2eaddSHong Zhang } 79678910aadSHong Zhang 79778910aadSHong Zhang /* initialization */ 7983cea4cbeSHong Zhang shiftnz = info->shiftnz; 7993cea4cbeSHong Zhang shiftpd = info->shiftpd; 8003cea4cbeSHong Zhang zeropivot = info->zeropivot; 8013cea4cbeSHong Zhang 8026ad2eaddSHong Zhang ierr = ISGetIndices(ip,&rip);CHKERRQ(ierr); 80378910aadSHong Zhang nz = (2*mbs+1)*sizeof(PetscInt)+mbs*sizeof(MatScalar); 80478910aadSHong Zhang ierr = PetscMalloc(nz,&il);CHKERRQ(ierr); 80578910aadSHong Zhang jl = il + mbs; 80678910aadSHong Zhang rtmp = (MatScalar*)(jl + mbs); 80778910aadSHong Zhang 8083cea4cbeSHong Zhang sctx.shift_amount = 0; 8093cea4cbeSHong Zhang sctx.nshift = 0; 81078910aadSHong Zhang do { 8113cea4cbeSHong Zhang sctx.chshift = PETSC_FALSE; 81278910aadSHong Zhang for (i=0; i<mbs; i++) { 81378910aadSHong Zhang rtmp[i] = 0.0; jl[i] = mbs; il[0] = 0; 81478910aadSHong Zhang } 81578910aadSHong Zhang 81678910aadSHong Zhang for (k = 0; k<mbs; k++){ 81778910aadSHong Zhang bval = ba + bi[k]; 81878910aadSHong Zhang /* initialize k-th row by the perm[k]-th row of A */ 81978910aadSHong Zhang jmin = ai[rip[k]]; jmax = ai[rip[k]+1]; 82078910aadSHong Zhang for (j = jmin; j < jmax; j++){ 82178910aadSHong Zhang col = rip[aj[j]]; 82278910aadSHong Zhang if (col >= k){ /* only take upper triangular entry */ 82378910aadSHong Zhang rtmp[col] = aa[j]; 82478910aadSHong Zhang *bval++ = 0.0; /* for in-place factorization */ 82578910aadSHong Zhang } 82678910aadSHong Zhang } 8273cea4cbeSHong Zhang 8283cea4cbeSHong Zhang /* shift the diagonal of the matrix */ 8293cea4cbeSHong Zhang if (sctx.nshift) rtmp[k] += sctx.shift_amount; 83078910aadSHong Zhang 83178910aadSHong Zhang /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 83278910aadSHong Zhang dk = rtmp[k]; 83378910aadSHong Zhang i = jl[k]; /* first row to be added to k_th row */ 83478910aadSHong Zhang 83578910aadSHong Zhang while (i < k){ 83678910aadSHong Zhang nexti = jl[i]; /* next row to be added to k_th row */ 83778910aadSHong Zhang 83878910aadSHong Zhang /* compute multiplier, update diag(k) and U(i,k) */ 83978910aadSHong Zhang ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 84078910aadSHong Zhang uikdi = - ba[ili]*ba[bi[i]]; /* diagonal(k) */ 84178910aadSHong Zhang dk += uikdi*ba[ili]; 84278910aadSHong Zhang ba[ili] = uikdi; /* -U(i,k) */ 84378910aadSHong Zhang 84478910aadSHong Zhang /* add multiple of row i to k-th row */ 84578910aadSHong Zhang jmin = ili + 1; jmax = bi[i+1]; 84678910aadSHong Zhang if (jmin < jmax){ 84778910aadSHong Zhang for (j=jmin; j<jmax; j++) rtmp[bj[j]] += uikdi*ba[j]; 84878910aadSHong Zhang /* update il and jl for row i */ 84978910aadSHong Zhang il[i] = jmin; 85078910aadSHong Zhang j = bj[jmin]; jl[i] = jl[j]; jl[j] = i; 85178910aadSHong Zhang } 85278910aadSHong Zhang i = nexti; 85378910aadSHong Zhang } 85478910aadSHong Zhang 8553cea4cbeSHong Zhang /* shift the diagonals when zero pivot is detected */ 8563cea4cbeSHong Zhang /* compute rs=sum of abs(off-diagonal) */ 8573cea4cbeSHong Zhang rs = 0.0; 85878910aadSHong Zhang jmin = bi[k]+1; 85978910aadSHong Zhang nz = bi[k+1] - jmin; 86078910aadSHong Zhang if (nz){ 86178910aadSHong Zhang bcol = bj + jmin; 86278910aadSHong Zhang while (nz--){ 86389f845c8SHong Zhang rs += PetscAbsScalar(rtmp[*bcol]); 86489f845c8SHong Zhang bcol++; 86578910aadSHong Zhang } 86678910aadSHong Zhang } 8673cea4cbeSHong Zhang 8683cea4cbeSHong Zhang sctx.rs = rs; 8693cea4cbeSHong Zhang sctx.pv = dk; 87045938b79SHong Zhang ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr); 87145938b79SHong Zhang if (newshift == 1) break; 87278910aadSHong Zhang 87378910aadSHong Zhang /* copy data into U(k,:) */ 87478910aadSHong Zhang ba[bi[k]] = 1.0/dk; /* U(k,k) */ 87578910aadSHong Zhang jmin = bi[k]+1; jmax = bi[k+1]; 87678910aadSHong Zhang if (jmin < jmax) { 87778910aadSHong Zhang for (j=jmin; j<jmax; j++){ 87878910aadSHong Zhang col = bj[j]; ba[j] = rtmp[col]; rtmp[col] = 0.0; 87978910aadSHong Zhang } 88078910aadSHong Zhang /* add the k-th row into il and jl */ 88178910aadSHong Zhang il[k] = jmin; 88278910aadSHong Zhang i = bj[jmin]; jl[k] = jl[i]; jl[i] = k; 88378910aadSHong Zhang } 88478910aadSHong Zhang } 8853cea4cbeSHong Zhang } while (sctx.chshift); 88678910aadSHong Zhang ierr = PetscFree(il);CHKERRQ(ierr); 88778910aadSHong Zhang 88878910aadSHong Zhang ierr = ISRestoreIndices(ip,&rip);CHKERRQ(ierr); 88978910aadSHong Zhang C->assembled = PETSC_TRUE; 89078910aadSHong Zhang C->preallocated = PETSC_TRUE; 891d0f46423SBarry Smith ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr); 8923cea4cbeSHong Zhang if (sctx.nshift){ 893e738e422SBarry Smith if (shiftpd) { 8941e2582c4SBarry Smith ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 895e738e422SBarry Smith } else if (shiftnz) { 896e738e422SBarry Smith ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 89778910aadSHong Zhang } 89878910aadSHong Zhang } 899c05c3958SHong Zhang PetscFunctionReturn(0); 900c05c3958SHong Zhang } 9014cd38560SBarry Smith 902c05c3958SHong Zhang #undef __FUNCT__ 903c05c3958SHong Zhang #define __FUNCT__ "MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering" 9040481f469SBarry Smith PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C,Mat A,const MatFactorInfo *info) 905c05c3958SHong Zhang { 90678910aadSHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 90778910aadSHong Zhang Mat_SeqSBAIJ *b=(Mat_SeqSBAIJ*)C->data; 90878910aadSHong Zhang PetscErrorCode ierr; 9093cea4cbeSHong Zhang PetscInt i,j,am=a->mbs; 91078910aadSHong Zhang PetscInt *ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 9113cea4cbeSHong Zhang PetscInt k,jmin,*jl,*il,nexti,ili,*acol,*bcol,nz; 91278910aadSHong Zhang MatScalar *rtmp,*ba=b->a,*aa=a->a,dk,uikdi,*aval,*bval; 9133cea4cbeSHong Zhang PetscReal zeropivot,rs,shiftnz; 914fbf22428SSatish Balay PetscReal shiftpd; 9153cea4cbeSHong Zhang ChShift_Ctx sctx; 9163cea4cbeSHong Zhang PetscInt newshift; 91778910aadSHong Zhang 918c05c3958SHong Zhang PetscFunctionBegin; 91978910aadSHong Zhang /* initialization */ 9203cea4cbeSHong Zhang shiftnz = info->shiftnz; 9213cea4cbeSHong Zhang shiftpd = info->shiftpd; 9223cea4cbeSHong Zhang zeropivot = info->zeropivot; 9233cea4cbeSHong Zhang 92478910aadSHong Zhang nz = (2*am+1)*sizeof(PetscInt)+am*sizeof(MatScalar); 92578910aadSHong Zhang ierr = PetscMalloc(nz,&il);CHKERRQ(ierr); 92678910aadSHong Zhang jl = il + am; 92778910aadSHong Zhang rtmp = (MatScalar*)(jl + am); 92878910aadSHong Zhang 9293cea4cbeSHong Zhang sctx.shift_amount = 0; 9303cea4cbeSHong Zhang sctx.nshift = 0; 93178910aadSHong Zhang do { 9323cea4cbeSHong Zhang sctx.chshift = PETSC_FALSE; 93378910aadSHong Zhang for (i=0; i<am; i++) { 93478910aadSHong Zhang rtmp[i] = 0.0; jl[i] = am; il[0] = 0; 93578910aadSHong Zhang } 93678910aadSHong Zhang 93778910aadSHong Zhang for (k = 0; k<am; k++){ 93878910aadSHong Zhang /* initialize k-th row with elements nonzero in row perm(k) of A */ 93978910aadSHong Zhang nz = ai[k+1] - ai[k]; 94078910aadSHong Zhang acol = aj + ai[k]; 94178910aadSHong Zhang aval = aa + ai[k]; 94278910aadSHong Zhang bval = ba + bi[k]; 94378910aadSHong Zhang while (nz -- ){ 94478910aadSHong Zhang if (*acol < k) { /* skip lower triangular entries */ 94578910aadSHong Zhang acol++; aval++; 94678910aadSHong Zhang } else { 94778910aadSHong Zhang rtmp[*acol++] = *aval++; 94878910aadSHong Zhang *bval++ = 0.0; /* for in-place factorization */ 94978910aadSHong Zhang } 95078910aadSHong Zhang } 9513cea4cbeSHong Zhang 9523cea4cbeSHong Zhang /* shift the diagonal of the matrix */ 9533cea4cbeSHong Zhang if (sctx.nshift) rtmp[k] += sctx.shift_amount; 95478910aadSHong Zhang 95578910aadSHong Zhang /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 95678910aadSHong Zhang dk = rtmp[k]; 95778910aadSHong Zhang i = jl[k]; /* first row to be added to k_th row */ 95878910aadSHong Zhang 95978910aadSHong Zhang while (i < k){ 96078910aadSHong Zhang nexti = jl[i]; /* next row to be added to k_th row */ 96178910aadSHong Zhang /* compute multiplier, update D(k) and U(i,k) */ 96278910aadSHong Zhang ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 96378910aadSHong Zhang uikdi = - ba[ili]*ba[bi[i]]; 96478910aadSHong Zhang dk += uikdi*ba[ili]; 96578910aadSHong Zhang ba[ili] = uikdi; /* -U(i,k) */ 96678910aadSHong Zhang 96778910aadSHong Zhang /* add multiple of row i to k-th row ... */ 96878910aadSHong Zhang jmin = ili + 1; 96978910aadSHong Zhang nz = bi[i+1] - jmin; 97078910aadSHong Zhang if (nz > 0){ 97178910aadSHong Zhang bcol = bj + jmin; 97278910aadSHong Zhang bval = ba + jmin; 97378910aadSHong Zhang while (nz --) rtmp[*bcol++] += uikdi*(*bval++); 97478910aadSHong Zhang /* update il and jl for i-th row */ 97578910aadSHong Zhang il[i] = jmin; 97678910aadSHong Zhang j = bj[jmin]; jl[i] = jl[j]; jl[j] = i; 97778910aadSHong Zhang } 97878910aadSHong Zhang i = nexti; 97978910aadSHong Zhang } 98078910aadSHong Zhang 9813cea4cbeSHong Zhang /* shift the diagonals when zero pivot is detected */ 9823cea4cbeSHong Zhang /* compute rs=sum of abs(off-diagonal) */ 9833cea4cbeSHong Zhang rs = 0.0; 98478910aadSHong Zhang jmin = bi[k]+1; 98578910aadSHong Zhang nz = bi[k+1] - jmin; 98678910aadSHong Zhang if (nz){ 98778910aadSHong Zhang bcol = bj + jmin; 98878910aadSHong Zhang while (nz--){ 98989f845c8SHong Zhang rs += PetscAbsScalar(rtmp[*bcol]); 99089f845c8SHong Zhang bcol++; 99178910aadSHong Zhang } 99278910aadSHong Zhang } 9933cea4cbeSHong Zhang 9943cea4cbeSHong Zhang sctx.rs = rs; 9953cea4cbeSHong Zhang sctx.pv = dk; 99645938b79SHong Zhang ierr = MatCholeskyCheckShift_inline(info,sctx,k,newshift);CHKERRQ(ierr); 99745938b79SHong Zhang if (newshift == 1) break; /* sctx.shift_amount is updated */ 99878910aadSHong Zhang 99978910aadSHong Zhang /* copy data into U(k,:) */ 100078910aadSHong Zhang ba[bi[k]] = 1.0/dk; 100178910aadSHong Zhang jmin = bi[k]+1; 100278910aadSHong Zhang nz = bi[k+1] - jmin; 100378910aadSHong Zhang if (nz){ 100478910aadSHong Zhang bcol = bj + jmin; 100578910aadSHong Zhang bval = ba + jmin; 100678910aadSHong Zhang while (nz--){ 100778910aadSHong Zhang *bval++ = rtmp[*bcol]; 100878910aadSHong Zhang rtmp[*bcol++] = 0.0; 100978910aadSHong Zhang } 101078910aadSHong Zhang /* add k-th row into il and jl */ 101178910aadSHong Zhang il[k] = jmin; 101278910aadSHong Zhang i = bj[jmin]; jl[k] = jl[i]; jl[i] = k; 101378910aadSHong Zhang } 101478910aadSHong Zhang } 10153cea4cbeSHong Zhang } while (sctx.chshift); 101678910aadSHong Zhang ierr = PetscFree(il);CHKERRQ(ierr); 101778910aadSHong Zhang 1018719d5645SBarry Smith C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 1019719d5645SBarry Smith C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering; 102078910aadSHong Zhang C->assembled = PETSC_TRUE; 102178910aadSHong Zhang C->preallocated = PETSC_TRUE; 1022d0f46423SBarry Smith ierr = PetscLogFlops(C->rmap->N);CHKERRQ(ierr); 10233cea4cbeSHong Zhang if (sctx.nshift){ 10243cea4cbeSHong Zhang if (shiftnz) { 10251e2582c4SBarry Smith ierr = PetscInfo2(A,"number of shiftnz tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 10263cea4cbeSHong Zhang } else if (shiftpd) { 10271e2582c4SBarry Smith ierr = PetscInfo2(A,"number of shiftpd tries %D, shift_amount %G\n",sctx.nshift,sctx.shift_amount);CHKERRQ(ierr); 102878910aadSHong Zhang } 102978910aadSHong Zhang } 1030c05c3958SHong Zhang PetscFunctionReturn(0); 1031c05c3958SHong Zhang } 1032c05c3958SHong Zhang 1033c05c3958SHong Zhang #include "petscbt.h" 10347c4f633dSBarry Smith #include "../src/mat/utils/freespace.h" 1035c05c3958SHong Zhang #undef __FUNCT__ 1036c05c3958SHong Zhang #define __FUNCT__ "MatICCFactorSymbolic_SeqBAIJ" 10370481f469SBarry Smith PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 1038c05c3958SHong Zhang { 103978910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 104078910aadSHong Zhang Mat_SeqSBAIJ *b; 104178910aadSHong Zhang Mat B; 104278910aadSHong Zhang PetscErrorCode ierr; 104378910aadSHong Zhang PetscTruth perm_identity; 10445d0c19d7SBarry Smith PetscInt reallocs=0,i,*ai=a->i,*aj=a->j,am=a->mbs,bs=A->rmap->bs,*ui; 10455d0c19d7SBarry Smith const PetscInt *rip; 104678910aadSHong Zhang PetscInt jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow; 1047cfdb8b8aSHong Zhang PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,ncols_upper,*cols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr; 104878910aadSHong Zhang PetscReal fill=info->fill,levels=info->levels; 1049a1a86e44SBarry Smith PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 1050a1a86e44SBarry Smith PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 105178910aadSHong Zhang PetscBT lnkbt; 105278910aadSHong Zhang 1053c05c3958SHong Zhang PetscFunctionBegin; 10546ad2eaddSHong Zhang if (bs > 1){ 10556ad2eaddSHong Zhang if (!a->sbaijMat){ 1056ceb03754SKris Buschelman ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr); 10576ad2eaddSHong Zhang } 1058719d5645SBarry Smith (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1059719d5645SBarry Smith ierr = MatICCFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr); 10606ad2eaddSHong Zhang PetscFunctionReturn(0); 10616ad2eaddSHong Zhang } 10626ad2eaddSHong Zhang 106378910aadSHong Zhang ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 106478910aadSHong Zhang ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 106578910aadSHong Zhang 106678910aadSHong Zhang /* special case that simply copies fill pattern */ 106778910aadSHong Zhang if (!levels && perm_identity) { 106878910aadSHong Zhang ierr = MatMarkDiagonal_SeqBAIJ(A);CHKERRQ(ierr); 106978910aadSHong Zhang ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 107078910aadSHong Zhang for (i=0; i<am; i++) { 107178910aadSHong Zhang ui[i] = ai[i+1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 107278910aadSHong Zhang } 1073719d5645SBarry Smith B = fact; 107478910aadSHong Zhang ierr = MatSeqSBAIJSetPreallocation(B,1,0,ui);CHKERRQ(ierr); 107578910aadSHong Zhang 1076b24902e0SBarry Smith 107778910aadSHong Zhang b = (Mat_SeqSBAIJ*)B->data; 107878910aadSHong Zhang uj = b->j; 107978910aadSHong Zhang for (i=0; i<am; i++) { 108078910aadSHong Zhang aj = a->j + a->diag[i]; 108178910aadSHong Zhang for (j=0; j<ui[i]; j++){ 108278910aadSHong Zhang *uj++ = *aj++; 108378910aadSHong Zhang } 108478910aadSHong Zhang b->ilen[i] = ui[i]; 108578910aadSHong Zhang } 108678910aadSHong Zhang ierr = PetscFree(ui);CHKERRQ(ierr); 1087719d5645SBarry Smith B->factor = MAT_FACTOR_NONE; 108878910aadSHong Zhang ierr = MatAssemblyBegin(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 108978910aadSHong Zhang ierr = MatAssemblyEnd(B,MAT_FINAL_ASSEMBLY);CHKERRQ(ierr); 1090719d5645SBarry Smith B->factor = MAT_FACTOR_ICC; 109178910aadSHong Zhang 109278910aadSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 109378910aadSHong Zhang PetscFunctionReturn(0); 109478910aadSHong Zhang } 109578910aadSHong Zhang 109678910aadSHong Zhang /* initialization */ 109778910aadSHong Zhang ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 109878910aadSHong Zhang ui[0] = 0; 109978910aadSHong Zhang ierr = PetscMalloc((2*am+1)*sizeof(PetscInt),&cols_lvl);CHKERRQ(ierr); 110078910aadSHong Zhang 110178910aadSHong Zhang /* jl: linked list for storing indices of the pivot rows 110278910aadSHong Zhang il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 110378910aadSHong Zhang ierr = PetscMalloc((2*am+1)*sizeof(PetscInt)+2*am*sizeof(PetscInt*),&jl);CHKERRQ(ierr); 110478910aadSHong Zhang il = jl + am; 110578910aadSHong Zhang uj_ptr = (PetscInt**)(il + am); 110678910aadSHong Zhang uj_lvl_ptr = (PetscInt**)(uj_ptr + am); 110778910aadSHong Zhang for (i=0; i<am; i++){ 110878910aadSHong Zhang jl[i] = am; il[i] = 0; 110978910aadSHong Zhang } 111078910aadSHong Zhang 111178910aadSHong Zhang /* create and initialize a linked list for storing column indices of the active row k */ 111278910aadSHong Zhang nlnk = am + 1; 111378910aadSHong Zhang ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 111478910aadSHong Zhang 111578910aadSHong Zhang /* initial FreeSpace size is fill*(ai[am]+1) */ 1116a1a86e44SBarry Smith ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr); 111778910aadSHong Zhang current_space = free_space; 1118a1a86e44SBarry Smith ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr); 111978910aadSHong Zhang current_space_lvl = free_space_lvl; 112078910aadSHong Zhang 112178910aadSHong Zhang for (k=0; k<am; k++){ /* for each active row k */ 112278910aadSHong Zhang /* initialize lnk by the column indices of row rip[k] of A */ 112378910aadSHong Zhang nzk = 0; 112478910aadSHong Zhang ncols = ai[rip[k]+1] - ai[rip[k]]; 112578910aadSHong Zhang ncols_upper = 0; 112678910aadSHong Zhang cols = cols_lvl + am; 112778910aadSHong Zhang for (j=0; j<ncols; j++){ 112878910aadSHong Zhang i = rip[*(aj + ai[rip[k]] + j)]; 112978910aadSHong Zhang if (i >= k){ /* only take upper triangular entry */ 113078910aadSHong Zhang cols[ncols_upper] = i; 113178910aadSHong Zhang cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 113278910aadSHong Zhang ncols_upper++; 113378910aadSHong Zhang } 113478910aadSHong Zhang } 113578910aadSHong Zhang ierr = PetscIncompleteLLAdd(ncols_upper,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 113678910aadSHong Zhang nzk += nlnk; 113778910aadSHong Zhang 113878910aadSHong Zhang /* update lnk by computing fill-in for each pivot row to be merged in */ 113978910aadSHong Zhang prow = jl[k]; /* 1st pivot row */ 114078910aadSHong Zhang 114178910aadSHong Zhang while (prow < k){ 114278910aadSHong Zhang nextprow = jl[prow]; 114378910aadSHong Zhang 114478910aadSHong Zhang /* merge prow into k-th row */ 114578910aadSHong Zhang jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 114678910aadSHong Zhang jmax = ui[prow+1]; 114778910aadSHong Zhang ncols = jmax-jmin; 114878910aadSHong Zhang i = jmin - ui[prow]; 114978910aadSHong Zhang cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 115078910aadSHong Zhang for (j=0; j<ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 11515a8e39fbSHong Zhang ierr = PetscIncompleteLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 115278910aadSHong Zhang nzk += nlnk; 115378910aadSHong Zhang 115478910aadSHong Zhang /* update il and jl for prow */ 115578910aadSHong Zhang if (jmin < jmax){ 115678910aadSHong Zhang il[prow] = jmin; 115778910aadSHong Zhang j = *cols; jl[prow] = jl[j]; jl[j] = prow; 115878910aadSHong Zhang } 115978910aadSHong Zhang prow = nextprow; 116078910aadSHong Zhang } 116178910aadSHong Zhang 116278910aadSHong Zhang /* if free space is not available, make more free space */ 116378910aadSHong Zhang if (current_space->local_remaining<nzk) { 116478910aadSHong Zhang i = am - k + 1; /* num of unfactored rows */ 116578910aadSHong Zhang i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1166a1a86e44SBarry Smith ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 1167a1a86e44SBarry Smith ierr = PetscFreeSpaceGet(i,¤t_space_lvl);CHKERRQ(ierr); 116878910aadSHong Zhang reallocs++; 116978910aadSHong Zhang } 117078910aadSHong Zhang 117178910aadSHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 117278910aadSHong Zhang ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 117378910aadSHong Zhang 117478910aadSHong Zhang /* add the k-th row into il and jl */ 117578910aadSHong Zhang if (nzk-1 > 0){ 117678910aadSHong Zhang i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 117778910aadSHong Zhang jl[k] = jl[i]; jl[i] = k; 117878910aadSHong Zhang il[k] = ui[k] + 1; 117978910aadSHong Zhang } 118078910aadSHong Zhang uj_ptr[k] = current_space->array; 118178910aadSHong Zhang uj_lvl_ptr[k] = current_space_lvl->array; 118278910aadSHong Zhang 118378910aadSHong Zhang current_space->array += nzk; 118478910aadSHong Zhang current_space->local_used += nzk; 118578910aadSHong Zhang current_space->local_remaining -= nzk; 118678910aadSHong Zhang 118778910aadSHong Zhang current_space_lvl->array += nzk; 118878910aadSHong Zhang current_space_lvl->local_used += nzk; 118978910aadSHong Zhang current_space_lvl->local_remaining -= nzk; 119078910aadSHong Zhang 119178910aadSHong Zhang ui[k+1] = ui[k] + nzk; 119278910aadSHong Zhang } 119378910aadSHong Zhang 11946cf91177SBarry Smith #if defined(PETSC_USE_INFO) 119578910aadSHong Zhang if (ai[am] != 0) { 119678910aadSHong Zhang PetscReal af = ((PetscReal)(2*ui[am]-am))/((PetscReal)ai[am]); 1197ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 1198ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 1199ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 120078910aadSHong Zhang } else { 1201ae15b995SBarry Smith ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 120278910aadSHong Zhang } 120363ba0a88SBarry Smith #endif 120478910aadSHong Zhang 120578910aadSHong Zhang ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 120678910aadSHong Zhang ierr = PetscFree(jl);CHKERRQ(ierr); 120778910aadSHong Zhang ierr = PetscFree(cols_lvl);CHKERRQ(ierr); 120878910aadSHong Zhang 120978910aadSHong Zhang /* destroy list of free space and other temporary array(s) */ 121078910aadSHong Zhang ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 1211a1a86e44SBarry Smith ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 121278910aadSHong Zhang ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 1213a1a86e44SBarry Smith ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 121478910aadSHong Zhang 121578910aadSHong Zhang /* put together the new matrix in MATSEQSBAIJ format */ 1216719d5645SBarry Smith B = fact; 1217ab93d7beSBarry Smith ierr = MatSeqSBAIJSetPreallocation(B,1,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 121878910aadSHong Zhang 121978910aadSHong Zhang b = (Mat_SeqSBAIJ*)B->data; 122078910aadSHong Zhang b->singlemalloc = PETSC_FALSE; 1221e6b907acSBarry Smith b->free_a = PETSC_TRUE; 1222e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 122378910aadSHong Zhang ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 122478910aadSHong Zhang b->j = uj; 122578910aadSHong Zhang b->i = ui; 122678910aadSHong Zhang b->diag = 0; 122778910aadSHong Zhang b->ilen = 0; 122878910aadSHong Zhang b->imax = 0; 122978910aadSHong Zhang b->row = perm; 123078910aadSHong Zhang b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 123178910aadSHong Zhang ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 123278910aadSHong Zhang b->icol = perm; 123378910aadSHong Zhang ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 123478910aadSHong Zhang ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 123552e6d16bSBarry Smith ierr = PetscLogObjectMemory(B,(ui[am]-am)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 123678910aadSHong Zhang b->maxnz = b->nz = ui[am]; 123778910aadSHong Zhang 123878910aadSHong Zhang B->info.factor_mallocs = reallocs; 123978910aadSHong Zhang B->info.fill_ratio_given = fill; 124078910aadSHong Zhang if (ai[am] != 0) { 124178910aadSHong Zhang B->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]); 124278910aadSHong Zhang } else { 124378910aadSHong Zhang B->info.fill_ratio_needed = 0.0; 124478910aadSHong Zhang } 124578910aadSHong Zhang if (perm_identity){ 124678910aadSHong Zhang B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering; 124778910aadSHong Zhang B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering; 124878910aadSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 124978910aadSHong Zhang } else { 1250719d5645SBarry Smith (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 125178910aadSHong Zhang } 1252c05c3958SHong Zhang PetscFunctionReturn(0); 1253c05c3958SHong Zhang } 1254c05c3958SHong Zhang 1255c05c3958SHong Zhang #undef __FUNCT__ 1256c05c3958SHong Zhang #define __FUNCT__ "MatCholeskyFactorSymbolic_SeqBAIJ" 12570481f469SBarry Smith PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info) 1258c05c3958SHong Zhang { 125978910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 126078910aadSHong Zhang Mat_SeqSBAIJ *b; 126178910aadSHong Zhang Mat B; 126278910aadSHong Zhang PetscErrorCode ierr; 126378910aadSHong Zhang PetscTruth perm_identity; 126478910aadSHong Zhang PetscReal fill = info->fill; 12655d0c19d7SBarry Smith const PetscInt *rip; 12665d0c19d7SBarry Smith PetscInt i,mbs=a->mbs,bs=A->rmap->bs,*ai=a->i,*aj=a->j,reallocs=0,prow; 126778910aadSHong Zhang PetscInt *jl,jmin,jmax,nzk,*ui,k,j,*il,nextprow; 126878910aadSHong Zhang PetscInt nlnk,*lnk,ncols,ncols_upper,*cols,*uj,**ui_ptr,*uj_ptr; 1269a1a86e44SBarry Smith PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 127078910aadSHong Zhang PetscBT lnkbt; 127178910aadSHong Zhang 1272c05c3958SHong Zhang PetscFunctionBegin; 12736ad2eaddSHong Zhang if (bs > 1) { /* convert to seqsbaij */ 12746ad2eaddSHong Zhang if (!a->sbaijMat){ 1275ceb03754SKris Buschelman ierr = MatConvert(A,MATSEQSBAIJ,MAT_INITIAL_MATRIX,&a->sbaijMat);CHKERRQ(ierr); 12766ad2eaddSHong Zhang } 1277719d5645SBarry Smith (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 1278719d5645SBarry Smith ierr = MatCholeskyFactorSymbolic(fact,a->sbaijMat,perm,info);CHKERRQ(ierr); 12796ad2eaddSHong Zhang PetscFunctionReturn(0); 12806ad2eaddSHong Zhang } 12816ad2eaddSHong Zhang 128278910aadSHong Zhang /* check whether perm is the identity mapping */ 128378910aadSHong Zhang ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr); 1284c84f5b01SHong Zhang if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported"); 128578910aadSHong Zhang ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr); 128678910aadSHong Zhang 128778910aadSHong Zhang /* initialization */ 128878910aadSHong Zhang ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr); 128978910aadSHong Zhang ui[0] = 0; 129078910aadSHong Zhang 129178910aadSHong Zhang /* jl: linked list for storing indices of the pivot rows 129278910aadSHong Zhang il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 129378910aadSHong Zhang ierr = PetscMalloc((3*mbs+1)*sizeof(PetscInt)+mbs*sizeof(PetscInt*),&jl);CHKERRQ(ierr); 129478910aadSHong Zhang il = jl + mbs; 129578910aadSHong Zhang cols = il + mbs; 129678910aadSHong Zhang ui_ptr = (PetscInt**)(cols + mbs); 129778910aadSHong Zhang for (i=0; i<mbs; i++){ 129878910aadSHong Zhang jl[i] = mbs; il[i] = 0; 129978910aadSHong Zhang } 130078910aadSHong Zhang 130178910aadSHong Zhang /* create and initialize a linked list for storing column indices of the active row k */ 130278910aadSHong Zhang nlnk = mbs + 1; 130378910aadSHong Zhang ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 130478910aadSHong Zhang 130578910aadSHong Zhang /* initial FreeSpace size is fill*(ai[mbs]+1) */ 1306a1a86e44SBarry Smith ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[mbs]+1)),&free_space);CHKERRQ(ierr); 130778910aadSHong Zhang current_space = free_space; 130878910aadSHong Zhang 130978910aadSHong Zhang for (k=0; k<mbs; k++){ /* for each active row k */ 131078910aadSHong Zhang /* initialize lnk by the column indices of row rip[k] of A */ 131178910aadSHong Zhang nzk = 0; 131278910aadSHong Zhang ncols = ai[rip[k]+1] - ai[rip[k]]; 131378910aadSHong Zhang ncols_upper = 0; 131478910aadSHong Zhang for (j=0; j<ncols; j++){ 131578910aadSHong Zhang i = rip[*(aj + ai[rip[k]] + j)]; 131678910aadSHong Zhang if (i >= k){ /* only take upper triangular entry */ 131778910aadSHong Zhang cols[ncols_upper] = i; 131878910aadSHong Zhang ncols_upper++; 131978910aadSHong Zhang } 132078910aadSHong Zhang } 132178910aadSHong Zhang ierr = PetscLLAdd(ncols_upper,cols,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 132278910aadSHong Zhang nzk += nlnk; 132378910aadSHong Zhang 132478910aadSHong Zhang /* update lnk by computing fill-in for each pivot row to be merged in */ 132578910aadSHong Zhang prow = jl[k]; /* 1st pivot row */ 132678910aadSHong Zhang 132778910aadSHong Zhang while (prow < k){ 132878910aadSHong Zhang nextprow = jl[prow]; 132978910aadSHong Zhang /* merge prow into k-th row */ 133078910aadSHong Zhang jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 133178910aadSHong Zhang jmax = ui[prow+1]; 133278910aadSHong Zhang ncols = jmax-jmin; 133378910aadSHong Zhang uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 13345a8e39fbSHong Zhang ierr = PetscLLAddSorted(ncols,uj_ptr,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 133578910aadSHong Zhang nzk += nlnk; 133678910aadSHong Zhang 133778910aadSHong Zhang /* update il and jl for prow */ 133878910aadSHong Zhang if (jmin < jmax){ 133978910aadSHong Zhang il[prow] = jmin; 134078910aadSHong Zhang j = *uj_ptr; jl[prow] = jl[j]; jl[j] = prow; 134178910aadSHong Zhang } 134278910aadSHong Zhang prow = nextprow; 134378910aadSHong Zhang } 134478910aadSHong Zhang 134578910aadSHong Zhang /* if free space is not available, make more free space */ 134678910aadSHong Zhang if (current_space->local_remaining<nzk) { 134778910aadSHong Zhang i = mbs - k + 1; /* num of unfactored rows */ 134878910aadSHong Zhang i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 1349a1a86e44SBarry Smith ierr = PetscFreeSpaceGet(i,¤t_space);CHKERRQ(ierr); 135078910aadSHong Zhang reallocs++; 135178910aadSHong Zhang } 135278910aadSHong Zhang 135378910aadSHong Zhang /* copy data into free space, then initialize lnk */ 135478910aadSHong Zhang ierr = PetscLLClean(mbs,mbs,nzk,lnk,current_space->array,lnkbt);CHKERRQ(ierr); 135578910aadSHong Zhang 135678910aadSHong Zhang /* add the k-th row into il and jl */ 135778910aadSHong Zhang if (nzk-1 > 0){ 135878910aadSHong Zhang i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 135978910aadSHong Zhang jl[k] = jl[i]; jl[i] = k; 136078910aadSHong Zhang il[k] = ui[k] + 1; 136178910aadSHong Zhang } 136278910aadSHong Zhang ui_ptr[k] = current_space->array; 136378910aadSHong Zhang current_space->array += nzk; 136478910aadSHong Zhang current_space->local_used += nzk; 136578910aadSHong Zhang current_space->local_remaining -= nzk; 136678910aadSHong Zhang 136778910aadSHong Zhang ui[k+1] = ui[k] + nzk; 136878910aadSHong Zhang } 136978910aadSHong Zhang 13706cf91177SBarry Smith #if defined(PETSC_USE_INFO) 137178910aadSHong Zhang if (ai[mbs] != 0) { 137278910aadSHong Zhang PetscReal af = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]); 1373ae15b995SBarry Smith ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr); 1374ae15b995SBarry Smith ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 1375ae15b995SBarry Smith ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr); 137678910aadSHong Zhang } else { 1377ae15b995SBarry Smith ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr); 137878910aadSHong Zhang } 137963ba0a88SBarry Smith #endif 138078910aadSHong Zhang 138178910aadSHong Zhang ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr); 138278910aadSHong Zhang ierr = PetscFree(jl);CHKERRQ(ierr); 138378910aadSHong Zhang 138478910aadSHong Zhang /* destroy list of free space and other temporary array(s) */ 138578910aadSHong Zhang ierr = PetscMalloc((ui[mbs]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr); 1386a1a86e44SBarry Smith ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr); 138778910aadSHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 138878910aadSHong Zhang 138978910aadSHong Zhang /* put together the new matrix in MATSEQSBAIJ format */ 1390719d5645SBarry Smith B = fact; 1391ab93d7beSBarry Smith ierr = MatSeqSBAIJSetPreallocation(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 139278910aadSHong Zhang 139378910aadSHong Zhang b = (Mat_SeqSBAIJ*)B->data; 139478910aadSHong Zhang b->singlemalloc = PETSC_FALSE; 1395e6b907acSBarry Smith b->free_a = PETSC_TRUE; 1396e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 139778910aadSHong Zhang ierr = PetscMalloc((ui[mbs]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 139878910aadSHong Zhang b->j = uj; 139978910aadSHong Zhang b->i = ui; 140078910aadSHong Zhang b->diag = 0; 140178910aadSHong Zhang b->ilen = 0; 140278910aadSHong Zhang b->imax = 0; 140378910aadSHong Zhang b->row = perm; 140478910aadSHong Zhang b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 140578910aadSHong Zhang ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 140678910aadSHong Zhang b->icol = perm; 140778910aadSHong Zhang ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr); 140878910aadSHong Zhang ierr = PetscMalloc((mbs+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 140952e6d16bSBarry Smith ierr = PetscLogObjectMemory(B,(ui[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 141078910aadSHong Zhang b->maxnz = b->nz = ui[mbs]; 141178910aadSHong Zhang 141278910aadSHong Zhang B->info.factor_mallocs = reallocs; 141378910aadSHong Zhang B->info.fill_ratio_given = fill; 141478910aadSHong Zhang if (ai[mbs] != 0) { 141578910aadSHong Zhang B->info.fill_ratio_needed = ((PetscReal)ui[mbs])/((PetscReal)ai[mbs]); 141678910aadSHong Zhang } else { 141778910aadSHong Zhang B->info.fill_ratio_needed = 0.0; 141878910aadSHong Zhang } 141978910aadSHong Zhang if (perm_identity){ 14206ad2eaddSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 142178910aadSHong Zhang } else { 14226ad2eaddSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 142378910aadSHong Zhang } 1424c05c3958SHong Zhang PetscFunctionReturn(0); 1425c05c3958SHong Zhang } 1426c8342467SHong Zhang 14272efa7f71SHong Zhang /* --------------------------------------------------------- */ 14282efa7f71SHong Zhang #undef __FUNCT__ 142984a281e5SHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct" 143084a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 14312efa7f71SHong Zhang { 14322efa7f71SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14332efa7f71SHong Zhang PetscErrorCode ierr; 14342efa7f71SHong Zhang const PetscInt *ai=a->i,*aj=a->j,*vi; 14356464896eSShri Abhyankar PetscInt i,k,n=a->mbs; 14362efa7f71SHong Zhang PetscInt nz,bs=A->rmap->bs,bs2=a->bs2,*adiag=a->diag; 14372efa7f71SHong Zhang MatScalar *aa=a->a,*v; 14382efa7f71SHong Zhang PetscScalar *x,*b,*s,*t,*ls; 14392efa7f71SHong Zhang 14402efa7f71SHong Zhang PetscFunctionBegin; 14416bce7ff8SHong Zhang /* printf("MatSolve_SeqBAIJ_NaturalOrdering_iludt..bs %d\n",bs); */ 14422efa7f71SHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14432efa7f71SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 14442efa7f71SHong Zhang t = a->solve_work; 14452efa7f71SHong Zhang 14462efa7f71SHong Zhang /* forward solve the lower triangular */ 14476da515a1SHong Zhang ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */ 14486bce7ff8SHong Zhang 14492efa7f71SHong Zhang for (i=1; i<n; i++) { 14502efa7f71SHong Zhang v = aa + bs2*ai[i]; 14512efa7f71SHong Zhang vi = aj + ai[i]; 14522efa7f71SHong Zhang nz = ai[i+1] - ai[i]; 14532efa7f71SHong Zhang s = t + bs*i; 14546da515a1SHong Zhang ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */ 14556464896eSShri Abhyankar for(k=0;k<nz;k++){ 14566464896eSShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]); 14572efa7f71SHong Zhang v += bs2; 14582efa7f71SHong Zhang } 14592efa7f71SHong Zhang } 14602efa7f71SHong Zhang 14612efa7f71SHong Zhang /* backward solve the upper triangular */ 14622efa7f71SHong Zhang ls = a->solve_work + A->cmap->n; 14632efa7f71SHong Zhang for (i=n-1; i>=0; i--){ 14646bce7ff8SHong Zhang v = aa + bs2*ai[2*n-i]; 14656bce7ff8SHong Zhang vi = aj + ai[2*n-i]; 14666bce7ff8SHong Zhang nz = ai[2*n-i +1] - ai[2*n-i]-1; 14672efa7f71SHong Zhang ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 14686464896eSShri Abhyankar for(k=0;k<nz;k++){ 14696464896eSShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]); 14702efa7f71SHong Zhang v += bs2; 14712efa7f71SHong Zhang } 14722efa7f71SHong Zhang Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */ 14732efa7f71SHong Zhang ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 14742efa7f71SHong Zhang } 14756bce7ff8SHong Zhang 14762efa7f71SHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 14772efa7f71SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 14782efa7f71SHong Zhang ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 14792efa7f71SHong Zhang PetscFunctionReturn(0); 14802efa7f71SHong Zhang } 14812efa7f71SHong Zhang 14822efa7f71SHong Zhang #undef __FUNCT__ 14831a83e813SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2" 14841a83e813SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx) 14851a83e813SShri Abhyankar { 14861a83e813SShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14871a83e813SShri Abhyankar PetscErrorCode ierr; 14881a83e813SShri Abhyankar const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi; 14891a83e813SShri Abhyankar PetscInt i,k,n=a->mbs; 14901a83e813SShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 14911a83e813SShri Abhyankar MatScalar *aa=a->a,*v; 14921a83e813SShri Abhyankar PetscScalar *x,*b,*s,*t,*ls; 14931a83e813SShri Abhyankar 14941a83e813SShri Abhyankar PetscFunctionBegin; 14951a83e813SShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 14961a83e813SShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 14971a83e813SShri Abhyankar t = a->solve_work; 14981a83e813SShri Abhyankar 14991a83e813SShri Abhyankar /* forward solve the lower triangular */ 15001a83e813SShri Abhyankar ierr = PetscMemcpy(t,b,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy 1st block of b to t */ 15011a83e813SShri Abhyankar 15021a83e813SShri Abhyankar for (i=1; i<n; i++) { 15031a83e813SShri Abhyankar v = aa + bs2*ai[i]; 15041a83e813SShri Abhyankar vi = aj + ai[i]; 15051a83e813SShri Abhyankar nz = ai[i+1] - ai[i]; 15061a83e813SShri Abhyankar s = t + bs*i; 15071a83e813SShri Abhyankar ierr = PetscMemcpy(s,b+bs*i,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* copy i_th block of b to t */ 15081a83e813SShri Abhyankar for(k=0;k<nz;k++){ 15091a83e813SShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[k]); 15101a83e813SShri Abhyankar v += bs2; 15111a83e813SShri Abhyankar } 15121a83e813SShri Abhyankar } 15131a83e813SShri Abhyankar 15141a83e813SShri Abhyankar /* backward solve the upper triangular */ 15151a83e813SShri Abhyankar ls = a->solve_work + A->cmap->n; 15161a83e813SShri Abhyankar for (i=n-1; i>=0; i--){ 15171a83e813SShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 15181a83e813SShri Abhyankar vi = aj + adiag[i+1]+1; 15191a83e813SShri Abhyankar nz = adiag[i] - adiag[i+1]-1; 15201a83e813SShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 15211a83e813SShri Abhyankar for(k=0;k<nz;k++){ 15221a83e813SShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[k]); 15231a83e813SShri Abhyankar v += bs2; 15241a83e813SShri Abhyankar } 15251a83e813SShri Abhyankar Kernel_w_gets_A_times_v(bs,ls,aa+bs2*adiag[i],t+i*bs); /* *inv(diagonal[i]) */ 15261a83e813SShri Abhyankar ierr = PetscMemcpy(x+i*bs,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 15271a83e813SShri Abhyankar } 15281a83e813SShri Abhyankar 15291a83e813SShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15301a83e813SShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 15311a83e813SShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 15321a83e813SShri Abhyankar PetscFunctionReturn(0); 15331a83e813SShri Abhyankar } 15341a83e813SShri Abhyankar 15351a83e813SShri Abhyankar #undef __FUNCT__ 153684a281e5SHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_newdatastruct" 153784a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx) 15382efa7f71SHong Zhang { 15392efa7f71SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 15402efa7f71SHong Zhang IS iscol=a->col,isrow=a->row; 15412efa7f71SHong Zhang PetscErrorCode ierr; 15422efa7f71SHong Zhang const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 154329b92fc1SShri Abhyankar PetscInt i,m,n=a->mbs; 15448f690400SShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2,k; 15452efa7f71SHong Zhang MatScalar *aa=a->a,*v; 15462efa7f71SHong Zhang PetscScalar *x,*b,*s,*t,*ls; 15472efa7f71SHong Zhang 15482efa7f71SHong Zhang PetscFunctionBegin; 15492efa7f71SHong Zhang ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 15502efa7f71SHong Zhang ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 15512efa7f71SHong Zhang t = a->solve_work; 15522efa7f71SHong Zhang 15532efa7f71SHong Zhang ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 155429b92fc1SShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 15552efa7f71SHong Zhang 15562efa7f71SHong Zhang /* forward solve the lower triangular */ 155729b92fc1SShri Abhyankar ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr); 15582efa7f71SHong Zhang for (i=1; i<n; i++) { 15592efa7f71SHong Zhang v = aa + bs2*ai[i]; 15602efa7f71SHong Zhang vi = aj + ai[i]; 15612efa7f71SHong Zhang nz = ai[i+1] - ai[i]; 15622efa7f71SHong Zhang s = t + bs*i; 156329b92fc1SShri Abhyankar ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr); 156429b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 156529b92fc1SShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]); 15662efa7f71SHong Zhang v += bs2; 15672efa7f71SHong Zhang } 15682efa7f71SHong Zhang } 15692efa7f71SHong Zhang 15702efa7f71SHong Zhang /* backward solve the upper triangular */ 15712efa7f71SHong Zhang ls = a->solve_work + A->cmap->n; 15722efa7f71SHong Zhang for (i=n-1; i>=0; i--){ 15738f690400SShri Abhyankar k = 2*n-i; 15748f690400SShri Abhyankar v = aa + bs2*ai[k]; 15758f690400SShri Abhyankar vi = aj + ai[k]; 15768f690400SShri Abhyankar nz = ai[k+1] - ai[k] - 1; 15772efa7f71SHong Zhang ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 157829b92fc1SShri Abhyankar for(m=0;m<nz;m++){ 157929b92fc1SShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]); 15802efa7f71SHong Zhang v += bs2; 15812efa7f71SHong Zhang } 15828f690400SShri Abhyankar Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */ 158329b92fc1SShri Abhyankar ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 15842efa7f71SHong Zhang } 15852efa7f71SHong Zhang ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 15862efa7f71SHong Zhang ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 15872efa7f71SHong Zhang ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 15882efa7f71SHong Zhang ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 15892efa7f71SHong Zhang ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 15902efa7f71SHong Zhang PetscFunctionReturn(0); 15912efa7f71SHong Zhang } 15922efa7f71SHong Zhang 15932efa7f71SHong Zhang #undef __FUNCT__ 1594*35aa4fcfSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_N_newdatastruct_v2" 1595*35aa4fcfSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct_v2(Mat A,Vec bb,Vec xx) 1596*35aa4fcfSShri Abhyankar { 1597*35aa4fcfSShri Abhyankar Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1598*35aa4fcfSShri Abhyankar IS iscol=a->col,isrow=a->row; 1599*35aa4fcfSShri Abhyankar PetscErrorCode ierr; 1600*35aa4fcfSShri Abhyankar const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi; 1601*35aa4fcfSShri Abhyankar PetscInt i,m,n=a->mbs; 1602*35aa4fcfSShri Abhyankar PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1603*35aa4fcfSShri Abhyankar MatScalar *aa=a->a,*v; 1604*35aa4fcfSShri Abhyankar PetscScalar *x,*b,*s,*t,*ls; 1605*35aa4fcfSShri Abhyankar 1606*35aa4fcfSShri Abhyankar PetscFunctionBegin; 1607*35aa4fcfSShri Abhyankar ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1608*35aa4fcfSShri Abhyankar ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1609*35aa4fcfSShri Abhyankar t = a->solve_work; 1610*35aa4fcfSShri Abhyankar 1611*35aa4fcfSShri Abhyankar ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1612*35aa4fcfSShri Abhyankar ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1613*35aa4fcfSShri Abhyankar 1614*35aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 1615*35aa4fcfSShri Abhyankar ierr = PetscMemcpy(t,b+bs*r[0],bs*sizeof(PetscScalar));CHKERRQ(ierr); 1616*35aa4fcfSShri Abhyankar for (i=1; i<n; i++) { 1617*35aa4fcfSShri Abhyankar v = aa + bs2*ai[i]; 1618*35aa4fcfSShri Abhyankar vi = aj + ai[i]; 1619*35aa4fcfSShri Abhyankar nz = ai[i+1] - ai[i]; 1620*35aa4fcfSShri Abhyankar s = t + bs*i; 1621*35aa4fcfSShri Abhyankar ierr = PetscMemcpy(s,b+bs*r[i],bs*sizeof(PetscScalar));CHKERRQ(ierr); 1622*35aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 1623*35aa4fcfSShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*vi[m]); 1624*35aa4fcfSShri Abhyankar v += bs2; 1625*35aa4fcfSShri Abhyankar } 1626*35aa4fcfSShri Abhyankar } 1627*35aa4fcfSShri Abhyankar 1628*35aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 1629*35aa4fcfSShri Abhyankar ls = a->solve_work + A->cmap->n; 1630*35aa4fcfSShri Abhyankar for (i=n-1; i>=0; i--){ 1631*35aa4fcfSShri Abhyankar v = aa + bs2*(adiag[i+1]+1); 1632*35aa4fcfSShri Abhyankar vi = aj + adiag[i+1]+1; 1633*35aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i+1] - 1; 1634*35aa4fcfSShri Abhyankar ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1635*35aa4fcfSShri Abhyankar for(m=0;m<nz;m++){ 1636*35aa4fcfSShri Abhyankar Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*vi[m]); 1637*35aa4fcfSShri Abhyankar v += bs2; 1638*35aa4fcfSShri Abhyankar } 1639*35aa4fcfSShri Abhyankar Kernel_w_gets_A_times_v(bs,ls,v,t+i*bs); /* *inv(diagonal[i]) */ 1640*35aa4fcfSShri Abhyankar ierr = PetscMemcpy(x + bs*c[i],t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1641*35aa4fcfSShri Abhyankar } 1642*35aa4fcfSShri Abhyankar ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1643*35aa4fcfSShri Abhyankar ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1644*35aa4fcfSShri Abhyankar ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1645*35aa4fcfSShri Abhyankar ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1646*35aa4fcfSShri Abhyankar ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1647*35aa4fcfSShri Abhyankar PetscFunctionReturn(0); 1648*35aa4fcfSShri Abhyankar } 1649*35aa4fcfSShri Abhyankar 1650*35aa4fcfSShri Abhyankar #undef __FUNCT__ 16512efa7f71SHong Zhang #define __FUNCT__ "BlockAbs_privat" 165291d91c02SMatthew Knepley PetscErrorCode BlockAbs_private(PetscInt nbs,PetscInt bs2,PetscScalar *blockarray,PetscReal *absarray) 16532efa7f71SHong Zhang { 16542efa7f71SHong Zhang PetscErrorCode ierr; 16552efa7f71SHong Zhang PetscInt i,j; 16562efa7f71SHong Zhang PetscFunctionBegin; 16572efa7f71SHong Zhang ierr = PetscMemzero(absarray,(nbs+1)*sizeof(PetscReal));CHKERRQ(ierr); 16582efa7f71SHong Zhang for (i=0; i<nbs; i++){ 16592efa7f71SHong Zhang for (j=0; j<bs2; j++){ 16602efa7f71SHong Zhang if (absarray[i] < PetscAbsScalar(blockarray[i*nbs+j])) absarray[i] = PetscAbsScalar(blockarray[i*nbs+j]); 16612efa7f71SHong Zhang } 16622efa7f71SHong Zhang } 16632efa7f71SHong Zhang PetscFunctionReturn(0); 16642efa7f71SHong Zhang } 16652efa7f71SHong Zhang 1666c8342467SHong Zhang #undef __FUNCT__ 1667c8342467SHong Zhang #define __FUNCT__ "MatILUDTFactor_SeqBAIJ" 1668c8342467SHong Zhang PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A,IS isrow,IS iscol,const MatFactorInfo *info,Mat *fact) 1669c8342467SHong Zhang { 16702efa7f71SHong Zhang Mat B = *fact; 16712efa7f71SHong Zhang Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b; 16722efa7f71SHong Zhang IS isicol; 16732efa7f71SHong Zhang PetscErrorCode ierr; 16742efa7f71SHong Zhang const PetscInt *r,*ic; 16752efa7f71SHong Zhang PetscInt i,mbs=a->mbs,bs=A->rmap->bs,bs2=a->bs2,*ai=a->i,*aj=a->j,*ajtmp,*adiag; 16762efa7f71SHong Zhang PetscInt *bi,*bj,*bdiag; 16772efa7f71SHong Zhang 16782efa7f71SHong Zhang PetscInt row,nzi,nzi_bl,nzi_bu,*im,dtcount,nzi_al,nzi_au; 16792efa7f71SHong Zhang PetscInt nlnk,*lnk; 16802efa7f71SHong Zhang PetscBT lnkbt; 16812efa7f71SHong Zhang PetscTruth row_identity,icol_identity,both_identity; 16822efa7f71SHong Zhang MatScalar *aatmp,*pv,*batmp,*ba,*rtmp,*pc,*multiplier,*vtmp; 16832efa7f71SHong Zhang const PetscInt *ics; 16842efa7f71SHong Zhang PetscInt j,nz,*pj,*bjtmp,k,ncut,*jtmp; 16852efa7f71SHong Zhang 16866da515a1SHong Zhang PetscReal dt=info->dt; /* shift=info->shiftinblocks; */ 16872efa7f71SHong Zhang PetscInt nnz_max; 16882efa7f71SHong Zhang PetscTruth missing; 1689021822bcSHong Zhang PetscReal *vtmp_abs; 169097ef94ebSSatish Balay MatScalar *v_work; 169197ef94ebSSatish Balay PetscInt *v_pivots; 16922efa7f71SHong Zhang 1693c8342467SHong Zhang PetscFunctionBegin; 16942efa7f71SHong Zhang /* ------- symbolic factorization, can be reused ---------*/ 16952efa7f71SHong Zhang ierr = MatMissingDiagonal(A,&missing,&i);CHKERRQ(ierr); 16962efa7f71SHong Zhang if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",i); 16972efa7f71SHong Zhang adiag=a->diag; 16982efa7f71SHong Zhang 16992efa7f71SHong Zhang ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 17002efa7f71SHong Zhang 17012efa7f71SHong Zhang /* bdiag is location of diagonal in factor */ 17022efa7f71SHong Zhang ierr = PetscMalloc((mbs+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 17032efa7f71SHong Zhang 17042efa7f71SHong Zhang /* allocate row pointers bi */ 17056bce7ff8SHong Zhang ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 17062efa7f71SHong Zhang 17072efa7f71SHong Zhang /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 17082efa7f71SHong Zhang dtcount = (PetscInt)info->dtcount; 17096bce7ff8SHong Zhang if (dtcount > mbs-1) dtcount = mbs-1; 17106bce7ff8SHong Zhang nnz_max = ai[mbs]+2*mbs*dtcount +2; 17116da515a1SHong Zhang /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 17122efa7f71SHong Zhang ierr = PetscMalloc(nnz_max*sizeof(PetscInt),&bj);CHKERRQ(ierr); 17136bce7ff8SHong Zhang nnz_max = nnz_max*bs2; 17142efa7f71SHong Zhang ierr = PetscMalloc(nnz_max*sizeof(MatScalar),&ba);CHKERRQ(ierr); 17152efa7f71SHong Zhang 17162efa7f71SHong Zhang /* put together the new matrix */ 17172efa7f71SHong Zhang ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(B,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 17182efa7f71SHong Zhang ierr = PetscLogObjectParent(B,isicol);CHKERRQ(ierr); 17192efa7f71SHong Zhang b = (Mat_SeqBAIJ*)(B)->data; 17202efa7f71SHong Zhang b->free_a = PETSC_TRUE; 17212efa7f71SHong Zhang b->free_ij = PETSC_TRUE; 17222efa7f71SHong Zhang b->singlemalloc = PETSC_FALSE; 17232efa7f71SHong Zhang b->a = ba; 17242efa7f71SHong Zhang b->j = bj; 17252efa7f71SHong Zhang b->i = bi; 17262efa7f71SHong Zhang b->diag = bdiag; 17272efa7f71SHong Zhang b->ilen = 0; 17282efa7f71SHong Zhang b->imax = 0; 17292efa7f71SHong Zhang b->row = isrow; 17302efa7f71SHong Zhang b->col = iscol; 17312efa7f71SHong Zhang ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 17322efa7f71SHong Zhang ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 17332efa7f71SHong Zhang b->icol = isicol; 17342efa7f71SHong Zhang ierr = PetscMalloc((bs*(mbs+1))*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 17352efa7f71SHong Zhang 17362efa7f71SHong Zhang ierr = PetscLogObjectMemory(B,nnz_max*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr); 17372efa7f71SHong Zhang b->maxnz = nnz_max/bs2; 17382efa7f71SHong Zhang 17392efa7f71SHong Zhang (B)->factor = MAT_FACTOR_ILUDT; 17402efa7f71SHong Zhang (B)->info.factor_mallocs = 0; 17412efa7f71SHong Zhang (B)->info.fill_ratio_given = ((PetscReal)nnz_max)/((PetscReal)(ai[mbs]*bs2)); 17422efa7f71SHong Zhang CHKMEMQ; 17432efa7f71SHong Zhang /* ------- end of symbolic factorization ---------*/ 17442efa7f71SHong Zhang ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 17452efa7f71SHong Zhang ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 17462efa7f71SHong Zhang ics = ic; 17472efa7f71SHong Zhang 17482efa7f71SHong Zhang /* linked list for storing column indices of the active row */ 17492efa7f71SHong Zhang nlnk = mbs + 1; 17502efa7f71SHong Zhang ierr = PetscLLCreate(mbs,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 17512efa7f71SHong Zhang 17522efa7f71SHong Zhang /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 17532efa7f71SHong Zhang ierr = PetscMalloc((2*mbs+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 17542efa7f71SHong Zhang jtmp = im + mbs; 17552efa7f71SHong Zhang /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 17562efa7f71SHong Zhang ierr = PetscMalloc((2*mbs*bs2+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 17572efa7f71SHong Zhang vtmp = rtmp + bs2*mbs; 1758021822bcSHong Zhang ierr = PetscMalloc((mbs+1)*sizeof(PetscReal),&vtmp_abs);CHKERRQ(ierr); 17592efa7f71SHong Zhang 17602efa7f71SHong Zhang ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 17612efa7f71SHong Zhang multiplier = v_work + bs; 17622efa7f71SHong Zhang v_pivots = (PetscInt*)(multiplier + bs2); 17632efa7f71SHong Zhang 17642efa7f71SHong Zhang bi[0] = 0; 17652efa7f71SHong Zhang bdiag[0] = (nnz_max/bs2)-1; /* location of diagonal in factor B */ 17666bce7ff8SHong Zhang bi[2*mbs+1] = bdiag[0]+1; /* endof bj and ba array */ 17672efa7f71SHong Zhang for (i=0; i<mbs; i++) { 17682efa7f71SHong Zhang /* copy initial fill into linked list */ 17692efa7f71SHong Zhang nzi = 0; /* nonzeros for active row i */ 17702efa7f71SHong Zhang nzi = ai[r[i]+1] - ai[r[i]]; 17712efa7f71SHong Zhang if (!nzi) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 17722efa7f71SHong Zhang nzi_al = adiag[r[i]] - ai[r[i]]; 17732efa7f71SHong Zhang nzi_au = ai[r[i]+1] - adiag[r[i]] -1; 17746da515a1SHong Zhang /* printf("row %d, nzi_al/au %d %d\n",i,nzi_al,nzi_au); */ 17752efa7f71SHong Zhang 17762efa7f71SHong Zhang /* load in initial unfactored row */ 17772efa7f71SHong Zhang ajtmp = aj + ai[r[i]]; 17782efa7f71SHong Zhang ierr = PetscLLAddPerm(nzi,ajtmp,ic,mbs,nlnk,lnk,lnkbt);CHKERRQ(ierr); 17792efa7f71SHong Zhang ierr = PetscMemzero(rtmp,(mbs*bs2+1)*sizeof(PetscScalar));CHKERRQ(ierr); 17802efa7f71SHong Zhang aatmp = a->a + bs2*ai[r[i]]; 17812efa7f71SHong Zhang for (j=0; j<nzi; j++) { 17822efa7f71SHong Zhang ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],aatmp+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 17832efa7f71SHong Zhang } 17842efa7f71SHong Zhang 17852efa7f71SHong Zhang /* add pivot rows into linked list */ 17862efa7f71SHong Zhang row = lnk[mbs]; 17872efa7f71SHong Zhang while (row < i) { 17882efa7f71SHong Zhang nzi_bl = bi[row+1] - bi[row] + 1; 17892efa7f71SHong Zhang bjtmp = bj + bdiag[row+1]+1; /* points to 1st column next to the diagonal in U */ 17902efa7f71SHong Zhang ierr = PetscLLAddSortedLU(bjtmp,row,nlnk,lnk,lnkbt,i,nzi_bl,im);CHKERRQ(ierr); 17912efa7f71SHong Zhang nzi += nlnk; 17922efa7f71SHong Zhang row = lnk[row]; 17932efa7f71SHong Zhang } 17942efa7f71SHong Zhang 17952efa7f71SHong Zhang /* copy data from lnk into jtmp, then initialize lnk */ 17962efa7f71SHong Zhang ierr = PetscLLClean(mbs,mbs,nzi,lnk,jtmp,lnkbt);CHKERRQ(ierr); 17972efa7f71SHong Zhang 17982efa7f71SHong Zhang /* numerical factorization */ 17992efa7f71SHong Zhang bjtmp = jtmp; 18002efa7f71SHong Zhang row = *bjtmp++; /* 1st pivot row */ 18012efa7f71SHong Zhang 18022efa7f71SHong Zhang while (row < i) { 18032efa7f71SHong Zhang pc = rtmp + bs2*row; 18042efa7f71SHong Zhang pv = ba + bs2*bdiag[row]; /* inv(diag) of the pivot row */ 18052efa7f71SHong Zhang Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 18062efa7f71SHong Zhang ierr = BlockAbs_private(1,bs2,pc,vtmp_abs);CHKERRQ(ierr); 18072efa7f71SHong Zhang if (vtmp_abs[0] > dt){ /* apply tolerance dropping rule */ 18082efa7f71SHong Zhang pj = bj + bdiag[row+1] + 1; /* point to 1st entry of U(row,:) */ 18092efa7f71SHong Zhang pv = ba + bs2*(bdiag[row+1] + 1); 18102efa7f71SHong Zhang nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diagonal */ 18112efa7f71SHong Zhang for (j=0; j<nz; j++){ 18122efa7f71SHong Zhang Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 18132efa7f71SHong Zhang } 18146da515a1SHong Zhang /* ierr = PetscLogFlops(bslog*(nz+1.0)-bs);CHKERRQ(ierr); */ 18152efa7f71SHong Zhang } 18162efa7f71SHong Zhang row = *bjtmp++; 18172efa7f71SHong Zhang } 18182efa7f71SHong Zhang 18192efa7f71SHong Zhang /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 18202efa7f71SHong Zhang nzi_bl = 0; j = 0; 18212efa7f71SHong Zhang while (jtmp[j] < i){ /* L-part. Note: jtmp is sorted */ 18222efa7f71SHong Zhang ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 18232efa7f71SHong Zhang nzi_bl++; j++; 18242efa7f71SHong Zhang } 18252efa7f71SHong Zhang nzi_bu = nzi - nzi_bl -1; 18266da515a1SHong Zhang /* printf("nzi %d, nzi_bl %d, nzi_bu %d\n",nzi,nzi_bl,nzi_bu); */ 18272efa7f71SHong Zhang 18282efa7f71SHong Zhang while (j < nzi){ /* U-part */ 18292efa7f71SHong Zhang ierr = PetscMemcpy(vtmp+bs2*j,rtmp+bs2*jtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 18302efa7f71SHong Zhang /* 18312efa7f71SHong Zhang printf(" col %d: ",jtmp[j]); 18322efa7f71SHong Zhang for (j1=0; j1<bs2; j1++) printf(" %g",*(vtmp+bs2*j+j1)); 18332efa7f71SHong Zhang printf(" \n"); 18342efa7f71SHong Zhang */ 18352efa7f71SHong Zhang j++; 18362efa7f71SHong Zhang } 18372efa7f71SHong Zhang 18382efa7f71SHong Zhang ierr = BlockAbs_private(nzi,bs2,vtmp,vtmp_abs);CHKERRQ(ierr); 18392efa7f71SHong Zhang /* 18402efa7f71SHong Zhang printf(" row %d, nzi %d, vtmp_abs\n",i,nzi); 18412efa7f71SHong Zhang for (j1=0; j1<nzi; j1++) printf(" (%d %g),",jtmp[j1],vtmp_abs[j1]); 18422efa7f71SHong Zhang printf(" \n"); 18432efa7f71SHong Zhang */ 18442efa7f71SHong Zhang bjtmp = bj + bi[i]; 18452efa7f71SHong Zhang batmp = ba + bs2*bi[i]; 18462efa7f71SHong Zhang /* apply level dropping rule to L part */ 18472efa7f71SHong Zhang ncut = nzi_al + dtcount; 18482efa7f71SHong Zhang if (ncut < nzi_bl){ 1849021822bcSHong Zhang ierr = PetscSortSplitReal(ncut,nzi_bl,vtmp_abs,jtmp);CHKERRQ(ierr); 18502efa7f71SHong Zhang ierr = PetscSortIntWithScalarArray(ncut,jtmp,vtmp);CHKERRQ(ierr); 18512efa7f71SHong Zhang } else { 18522efa7f71SHong Zhang ncut = nzi_bl; 18532efa7f71SHong Zhang } 18542efa7f71SHong Zhang for (j=0; j<ncut; j++){ 18552efa7f71SHong Zhang bjtmp[j] = jtmp[j]; 18562efa7f71SHong Zhang ierr = PetscMemcpy(batmp+bs2*j,rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 18572efa7f71SHong Zhang /* 18582efa7f71SHong Zhang printf(" col %d: ",bjtmp[j]); 18592efa7f71SHong Zhang for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*j+j1)); 18602efa7f71SHong Zhang printf("\n"); 18612efa7f71SHong Zhang */ 18622efa7f71SHong Zhang } 18632efa7f71SHong Zhang bi[i+1] = bi[i] + ncut; 18642efa7f71SHong Zhang nzi = ncut + 1; 18652efa7f71SHong Zhang 18662efa7f71SHong Zhang /* apply level dropping rule to U part */ 18672efa7f71SHong Zhang ncut = nzi_au + dtcount; 18682efa7f71SHong Zhang if (ncut < nzi_bu){ 1869021822bcSHong Zhang ierr = PetscSortSplitReal(ncut,nzi_bu,vtmp_abs+nzi_bl+1,jtmp+nzi_bl+1);CHKERRQ(ierr); 18702efa7f71SHong Zhang ierr = PetscSortIntWithScalarArray(ncut,jtmp+nzi_bl+1,vtmp+nzi_bl+1);CHKERRQ(ierr); 18712efa7f71SHong Zhang } else { 18722efa7f71SHong Zhang ncut = nzi_bu; 18732efa7f71SHong Zhang } 18742efa7f71SHong Zhang nzi += ncut; 18752efa7f71SHong Zhang 18762efa7f71SHong Zhang /* mark bdiagonal */ 18772efa7f71SHong Zhang bdiag[i+1] = bdiag[i] - (ncut + 1); 18786bce7ff8SHong Zhang bi[2*mbs - i] = bi[2*mbs - i +1] - (ncut + 1); 18796bce7ff8SHong Zhang 18802efa7f71SHong Zhang bjtmp = bj + bdiag[i]; 18812efa7f71SHong Zhang batmp = ba + bs2*bdiag[i]; 18822efa7f71SHong Zhang ierr = PetscMemcpy(batmp,rtmp+bs2*i,bs2*sizeof(MatScalar));CHKERRQ(ierr); 18832efa7f71SHong Zhang *bjtmp = i; 18842efa7f71SHong Zhang /* 18852efa7f71SHong Zhang printf(" diag %d: ",*bjtmp); 18862efa7f71SHong Zhang for (j=0; j<bs2; j++){ 18872efa7f71SHong Zhang printf(" %g,",batmp[j]); 18882efa7f71SHong Zhang } 18892efa7f71SHong Zhang printf("\n"); 18902efa7f71SHong Zhang */ 18912efa7f71SHong Zhang bjtmp = bj + bdiag[i+1]+1; 18922efa7f71SHong Zhang batmp = ba + (bdiag[i+1]+1)*bs2; 18932efa7f71SHong Zhang 18942efa7f71SHong Zhang for (k=0; k<ncut; k++){ 18952efa7f71SHong Zhang bjtmp[k] = jtmp[nzi_bl+1+k]; 18962efa7f71SHong Zhang ierr = PetscMemcpy(batmp+bs2*k,rtmp+bs2*bjtmp[k],bs2*sizeof(MatScalar));CHKERRQ(ierr); 18972efa7f71SHong Zhang /* 18982efa7f71SHong Zhang printf(" col %d:",bjtmp[k]); 18992efa7f71SHong Zhang for (j1=0; j1<bs2; j1++) printf(" %g,",*(batmp+bs2*k+j1)); 19002efa7f71SHong Zhang printf("\n"); 19012efa7f71SHong Zhang */ 19022efa7f71SHong Zhang } 19032efa7f71SHong Zhang 19042efa7f71SHong Zhang im[i] = nzi; /* used by PetscLLAddSortedLU() */ 19052efa7f71SHong Zhang 19062efa7f71SHong Zhang /* invert diagonal block for simplier triangular solves - add shift??? */ 19072efa7f71SHong Zhang batmp = ba + bs2*bdiag[i]; 19082efa7f71SHong Zhang ierr = Kernel_A_gets_inverse_A(bs,batmp,v_pivots,v_work);CHKERRQ(ierr); 19092efa7f71SHong Zhang } /* for (i=0; i<mbs; i++) */ 19102efa7f71SHong Zhang 19116da515a1SHong Zhang /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 19122efa7f71SHong Zhang if (bi[mbs] >= bdiag[mbs]) SETERRQ2(PETSC_ERR_ARG_SIZ,"end of L array %d cannot >= the beginning of U array %d",bi[mbs],bdiag[mbs]); 19132efa7f71SHong Zhang 19142efa7f71SHong Zhang ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 19152efa7f71SHong Zhang ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 19162efa7f71SHong Zhang 19172efa7f71SHong Zhang ierr = PetscLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 19182efa7f71SHong Zhang 19192efa7f71SHong Zhang ierr = PetscFree(im);CHKERRQ(ierr); 19202efa7f71SHong Zhang ierr = PetscFree(rtmp);CHKERRQ(ierr); 19212efa7f71SHong Zhang 19222efa7f71SHong Zhang ierr = PetscLogFlops(bs2*B->cmap->n);CHKERRQ(ierr); 19232efa7f71SHong Zhang b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 19242efa7f71SHong Zhang 19252efa7f71SHong Zhang ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 19262efa7f71SHong Zhang ierr = ISIdentity(isicol,&icol_identity);CHKERRQ(ierr); 19272efa7f71SHong Zhang both_identity = (PetscTruth) (row_identity && icol_identity); 19282efa7f71SHong Zhang if (row_identity && icol_identity) { 192984a281e5SHong Zhang B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 19302efa7f71SHong Zhang } else { 193184a281e5SHong Zhang B->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 19322efa7f71SHong Zhang } 19332efa7f71SHong Zhang 19342efa7f71SHong Zhang B->ops->solveadd = 0; 19352efa7f71SHong Zhang B->ops->solvetranspose = 0; 19362efa7f71SHong Zhang B->ops->solvetransposeadd = 0; 19372efa7f71SHong Zhang B->ops->matsolve = 0; 19382efa7f71SHong Zhang B->assembled = PETSC_TRUE; 19392efa7f71SHong Zhang B->preallocated = PETSC_TRUE; 1940c8342467SHong Zhang PetscFunctionReturn(0); 1941c8342467SHong Zhang } 1942