15d517e7eSBarry Smith /* 2ec3a8b7bSBarry Smith Factorization code for BAIJ format. 35d517e7eSBarry Smith */ 470f55243SBarry Smith #include "src/mat/impls/baij/seq/baij.h" 54078e994SBarry Smith #include "src/inline/ilu.h" 6ec3a8b7bSBarry Smith 76d84be18SBarry Smith /* ------------------------------------------------------------*/ 86d84be18SBarry Smith /* 94eeb42bcSBarry Smith Version for when blocks are 2 by 2 104eeb42bcSBarry Smith */ 114a2ae208SSatish Balay #undef __FUNCT__ 124a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2" 134eeb42bcSBarry Smith int MatLUFactorNumeric_SeqBAIJ_2(Mat A,Mat *B) 144eeb42bcSBarry Smith { 154eeb42bcSBarry Smith Mat C = *B; 164eeb42bcSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 177cdf2dbbSSatish Balay IS isrow = b->row,isicol = b->icol; 184078e994SBarry Smith int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 192515f8d2SSatish Balay int *ajtmpold,*ajtmp,nz,row; 20329f5518SBarry Smith int *diag_offset=b->diag,idx,*ai=a->i,*aj=a->j,*pj; 21329f5518SBarry Smith MatScalar *pv,*v,*rtmp,m1,m2,m3,m4,*pc,*w,*x,x1,x2,x3,x4; 222515f8d2SSatish Balay MatScalar p1,p2,p3,p4; 233f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 244eeb42bcSBarry Smith 253a40ed3dSBarry Smith PetscFunctionBegin; 264eeb42bcSBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 274eeb42bcSBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 28b0a32e0cSBarry Smith ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 294eeb42bcSBarry Smith 304eeb42bcSBarry Smith for (i=0; i<n; i++) { 314078e994SBarry Smith nz = bi[i+1] - bi[i]; 324078e994SBarry Smith ajtmp = bj + bi[i]; 334eeb42bcSBarry Smith for (j=0; j<nz; j++) { 344eeb42bcSBarry Smith x = rtmp+4*ajtmp[j]; x[0] = x[1] = x[2] = x[3] = 0.0; 354eeb42bcSBarry Smith } 364eeb42bcSBarry Smith /* load in initial (unfactored row) */ 374eeb42bcSBarry Smith idx = r[i]; 384078e994SBarry Smith nz = ai[idx+1] - ai[idx]; 394078e994SBarry Smith ajtmpold = aj + ai[idx]; 404078e994SBarry Smith v = aa + 4*ai[idx]; 414eeb42bcSBarry Smith for (j=0; j<nz; j++) { 424eeb42bcSBarry Smith x = rtmp+4*ic[ajtmpold[j]]; 434eeb42bcSBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 444eeb42bcSBarry Smith v += 4; 454eeb42bcSBarry Smith } 464eeb42bcSBarry Smith row = *ajtmp++; 474eeb42bcSBarry Smith while (row < i) { 484eeb42bcSBarry Smith pc = rtmp + 4*row; 494eeb42bcSBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 5088685aaeSLois Curfman McInnes if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 514078e994SBarry Smith pv = ba + 4*diag_offset[row]; 524078e994SBarry Smith pj = bj + diag_offset[row] + 1; 534eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 544eeb42bcSBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 554eeb42bcSBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 564eeb42bcSBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 574eeb42bcSBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 584078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 594eeb42bcSBarry Smith pv += 4; 604eeb42bcSBarry Smith for (j=0; j<nz; j++) { 614eeb42bcSBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 624eeb42bcSBarry Smith x = rtmp + 4*pj[j]; 634eeb42bcSBarry Smith x[0] -= m1*x1 + m3*x2; 644eeb42bcSBarry Smith x[1] -= m2*x1 + m4*x2; 654eeb42bcSBarry Smith x[2] -= m1*x3 + m3*x4; 664eeb42bcSBarry Smith x[3] -= m2*x3 + m4*x4; 674eeb42bcSBarry Smith pv += 4; 684eeb42bcSBarry Smith } 69b0a32e0cSBarry Smith PetscLogFlops(16*nz+12); 704eeb42bcSBarry Smith } 714eeb42bcSBarry Smith row = *ajtmp++; 724eeb42bcSBarry Smith } 734eeb42bcSBarry Smith /* finished row so stick it into b->a */ 744078e994SBarry Smith pv = ba + 4*bi[i]; 754078e994SBarry Smith pj = bj + bi[i]; 764078e994SBarry Smith nz = bi[i+1] - bi[i]; 774eeb42bcSBarry Smith for (j=0; j<nz; j++) { 784eeb42bcSBarry Smith x = rtmp+4*pj[j]; 794eeb42bcSBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 804eeb42bcSBarry Smith pv += 4; 814eeb42bcSBarry Smith } 824eeb42bcSBarry Smith /* invert diagonal block */ 834078e994SBarry Smith w = ba + 4*diag_offset[i]; 844cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 854eeb42bcSBarry Smith } 864eeb42bcSBarry Smith 87606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 884eeb42bcSBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 894eeb42bcSBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 904eeb42bcSBarry Smith C->factor = FACTOR_LU; 914eeb42bcSBarry Smith C->assembled = PETSC_TRUE; 92b0a32e0cSBarry Smith PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 933a40ed3dSBarry Smith PetscFunctionReturn(0); 944eeb42bcSBarry Smith } 954cd38560SBarry Smith /* 964cd38560SBarry Smith Version for when blocks are 2 by 2 Using natural ordering 974cd38560SBarry Smith */ 984a2ae208SSatish Balay #undef __FUNCT__ 994a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering" 1004cd38560SBarry Smith int MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat A,Mat *B) 1014cd38560SBarry Smith { 1024cd38560SBarry Smith Mat C = *B; 1034cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1044cd38560SBarry Smith int ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1052515f8d2SSatish Balay int *ajtmpold,*ajtmp,nz,row; 106329f5518SBarry Smith int *diag_offset = b->diag,*ai=a->i,*aj=a->j,*pj; 107329f5518SBarry Smith MatScalar *pv,*v,*rtmp,*pc,*w,*x; 1082515f8d2SSatish Balay MatScalar p1,p2,p3,p4,m1,m2,m3,m4,x1,x2,x3,x4; 1094cd38560SBarry Smith MatScalar *ba = b->a,*aa = a->a; 1104cd38560SBarry Smith 1114cd38560SBarry Smith PetscFunctionBegin; 112b0a32e0cSBarry Smith ierr = PetscMalloc(4*(n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 1134cd38560SBarry Smith 1144cd38560SBarry Smith for (i=0; i<n; i++) { 1154cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1164cd38560SBarry Smith ajtmp = bj + bi[i]; 1174cd38560SBarry Smith for (j=0; j<nz; j++) { 1184cd38560SBarry Smith x = rtmp+4*ajtmp[j]; 1194cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = 0.0; 1204cd38560SBarry Smith } 1214cd38560SBarry Smith /* load in initial (unfactored row) */ 1224cd38560SBarry Smith nz = ai[i+1] - ai[i]; 1234cd38560SBarry Smith ajtmpold = aj + ai[i]; 1244cd38560SBarry Smith v = aa + 4*ai[i]; 1254cd38560SBarry Smith for (j=0; j<nz; j++) { 1264cd38560SBarry Smith x = rtmp+4*ajtmpold[j]; 1274cd38560SBarry Smith x[0] = v[0]; x[1] = v[1]; x[2] = v[2]; x[3] = v[3]; 1284cd38560SBarry Smith v += 4; 1294cd38560SBarry Smith } 1304cd38560SBarry Smith row = *ajtmp++; 1314cd38560SBarry Smith while (row < i) { 1324cd38560SBarry Smith pc = rtmp + 4*row; 1334cd38560SBarry Smith p1 = pc[0]; p2 = pc[1]; p3 = pc[2]; p4 = pc[3]; 1344cd38560SBarry Smith if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0) { 1354cd38560SBarry Smith pv = ba + 4*diag_offset[row]; 1364cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 1374cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1384cd38560SBarry Smith pc[0] = m1 = p1*x1 + p3*x2; 1394cd38560SBarry Smith pc[1] = m2 = p2*x1 + p4*x2; 1404cd38560SBarry Smith pc[2] = m3 = p1*x3 + p3*x4; 1414cd38560SBarry Smith pc[3] = m4 = p2*x3 + p4*x4; 1424cd38560SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 1434cd38560SBarry Smith pv += 4; 1444cd38560SBarry Smith for (j=0; j<nz; j++) { 1454cd38560SBarry Smith x1 = pv[0]; x2 = pv[1]; x3 = pv[2]; x4 = pv[3]; 1464cd38560SBarry Smith x = rtmp + 4*pj[j]; 1474cd38560SBarry Smith x[0] -= m1*x1 + m3*x2; 1484cd38560SBarry Smith x[1] -= m2*x1 + m4*x2; 1494cd38560SBarry Smith x[2] -= m1*x3 + m3*x4; 1504cd38560SBarry Smith x[3] -= m2*x3 + m4*x4; 1514cd38560SBarry Smith pv += 4; 1524cd38560SBarry Smith } 153b0a32e0cSBarry Smith PetscLogFlops(16*nz+12); 1544cd38560SBarry Smith } 1554cd38560SBarry Smith row = *ajtmp++; 1564cd38560SBarry Smith } 1574cd38560SBarry Smith /* finished row so stick it into b->a */ 1584cd38560SBarry Smith pv = ba + 4*bi[i]; 1594cd38560SBarry Smith pj = bj + bi[i]; 1604cd38560SBarry Smith nz = bi[i+1] - bi[i]; 1614cd38560SBarry Smith for (j=0; j<nz; j++) { 1624cd38560SBarry Smith x = rtmp+4*pj[j]; 1634cd38560SBarry Smith pv[0] = x[0]; pv[1] = x[1]; pv[2] = x[2]; pv[3] = x[3]; 1644cd38560SBarry Smith pv += 4; 1654cd38560SBarry Smith } 1664cd38560SBarry Smith /* invert diagonal block */ 1674cd38560SBarry Smith w = ba + 4*diag_offset[i]; 1684cd38560SBarry Smith ierr = Kernel_A_gets_inverse_A_2(w);CHKERRQ(ierr); 1694cd38560SBarry Smith /*Kernel_A_gets_inverse_A(bs,w,v_pivots,v_work);*/ 1704cd38560SBarry Smith } 1714cd38560SBarry Smith 172606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 1734cd38560SBarry Smith C->factor = FACTOR_LU; 1744cd38560SBarry Smith C->assembled = PETSC_TRUE; 175b0a32e0cSBarry Smith PetscLogFlops(1.3333*8*b->mbs); /* from inverting diagonal blocks */ 1764cd38560SBarry Smith PetscFunctionReturn(0); 1774cd38560SBarry Smith } 1787fc0212eSBarry Smith 1797fc0212eSBarry Smith /* ----------------------------------------------------------- */ 1807fc0212eSBarry Smith /* 1817fc0212eSBarry Smith Version for when blocks are 1 by 1. 1827fc0212eSBarry Smith */ 1834a2ae208SSatish Balay #undef __FUNCT__ 1844a2ae208SSatish Balay #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_1" 1857fc0212eSBarry Smith int MatLUFactorNumeric_SeqBAIJ_1(Mat A,Mat *B) 1867fc0212eSBarry Smith { 1877fc0212eSBarry Smith Mat C = *B; 1887fc0212eSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ *)C->data; 1897cdf2dbbSSatish Balay IS isrow = b->row,isicol = b->icol; 1904078e994SBarry Smith int *r,*ic,ierr,i,j,n = a->mbs,*bi = b->i,*bj = b->j; 1914078e994SBarry Smith int *ajtmpold,*ajtmp,nz,row,*ai = a->i,*aj = a->j; 192329f5518SBarry Smith int *diag_offset = b->diag,diag,*pj; 193329f5518SBarry Smith MatScalar *pv,*v,*rtmp,multiplier,*pc; 1943f1db9ecSBarry Smith MatScalar *ba = b->a,*aa = a->a; 1957fc0212eSBarry Smith 1963a40ed3dSBarry Smith PetscFunctionBegin; 1977fc0212eSBarry Smith ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 1987fc0212eSBarry Smith ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 199b0a32e0cSBarry Smith ierr = PetscMalloc((n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 2007fc0212eSBarry Smith 2017fc0212eSBarry Smith for (i=0; i<n; i++) { 2024078e994SBarry Smith nz = bi[i+1] - bi[i]; 2034078e994SBarry Smith ajtmp = bj + bi[i]; 2047fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[ajtmp[j]] = 0.0; 2057fc0212eSBarry Smith 2067fc0212eSBarry Smith /* load in initial (unfactored row) */ 2074078e994SBarry Smith nz = ai[r[i]+1] - ai[r[i]]; 2084078e994SBarry Smith ajtmpold = aj + ai[r[i]]; 2094078e994SBarry Smith v = aa + ai[r[i]]; 2107fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 2117fc0212eSBarry Smith 2127fc0212eSBarry Smith row = *ajtmp++; 2137fc0212eSBarry Smith while (row < i) { 2147fc0212eSBarry Smith pc = rtmp + row; 2157fc0212eSBarry Smith if (*pc != 0.0) { 2164078e994SBarry Smith pv = ba + diag_offset[row]; 2174078e994SBarry Smith pj = bj + diag_offset[row] + 1; 2187fc0212eSBarry Smith multiplier = *pc * *pv++; 2197fc0212eSBarry Smith *pc = multiplier; 2204078e994SBarry Smith nz = bi[row+1] - diag_offset[row] - 1; 2217fc0212eSBarry Smith for (j=0; j<nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 222b0a32e0cSBarry Smith PetscLogFlops(1+2*nz); 2237fc0212eSBarry Smith } 2247fc0212eSBarry Smith row = *ajtmp++; 2257fc0212eSBarry Smith } 2267fc0212eSBarry Smith /* finished row so stick it into b->a */ 2274078e994SBarry Smith pv = ba + bi[i]; 2284078e994SBarry Smith pj = bj + bi[i]; 2294078e994SBarry Smith nz = bi[i+1] - bi[i]; 2307fc0212eSBarry Smith for (j=0; j<nz; j++) {pv[j] = rtmp[pj[j]];} 2314078e994SBarry Smith diag = diag_offset[i] - bi[i]; 2327fc0212eSBarry Smith /* check pivot entry for current row */ 233*a73cf429SBarry Smith if (pv[diag] == 0.0) { 23429bbc08cSBarry Smith SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Zero pivot"); 2357fc0212eSBarry Smith } 2367fc0212eSBarry Smith pv[diag] = 1.0/pv[diag]; 2377fc0212eSBarry Smith } 2387fc0212eSBarry Smith 239606d414cSSatish Balay ierr = PetscFree(rtmp);CHKERRQ(ierr); 2407fc0212eSBarry Smith ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2417fc0212eSBarry Smith ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2427fc0212eSBarry Smith C->factor = FACTOR_LU; 2437fc0212eSBarry Smith C->assembled = PETSC_TRUE; 244b0a32e0cSBarry Smith PetscLogFlops(C->n); 2453a40ed3dSBarry Smith PetscFunctionReturn(0); 2467fc0212eSBarry Smith } 2477fc0212eSBarry Smith 248273d9f13SBarry Smith 2495d517e7eSBarry Smith /* ----------------------------------------------------------- */ 2504a2ae208SSatish Balay #undef __FUNCT__ 2514a2ae208SSatish Balay #define __FUNCT__ "MatLUFactor_SeqBAIJ" 252b380c88cSHong Zhang int MatLUFactor_SeqBAIJ(Mat A,IS row,IS col,MatFactorInfo *info) 2535d517e7eSBarry Smith { 254273d9f13SBarry Smith int ierr; 2555d517e7eSBarry Smith Mat C; 2565d517e7eSBarry Smith 2573a40ed3dSBarry Smith PetscFunctionBegin; 258b9b97703SBarry Smith ierr = MatLUFactorSymbolic(A,row,col,info,&C);CHKERRQ(ierr); 2597fc0212eSBarry Smith ierr = MatLUFactorNumeric(A,&C);CHKERRQ(ierr); 260273d9f13SBarry Smith ierr = MatHeaderCopy(A,C);CHKERRQ(ierr); 261b0a32e0cSBarry Smith PetscLogObjectParent(A,((Mat_SeqBAIJ*)(A->data))->icol); 2623a40ed3dSBarry Smith PetscFunctionReturn(0); 2635d517e7eSBarry Smith } 2644cd38560SBarry Smith 2654cd38560SBarry Smith 266