1be1d678aSKris Buschelman 25d517e7eSBarry Smith /* 3ec3a8b7bSBarry Smith Factorization code for BAIJ format. 45d517e7eSBarry Smith */ 5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h> 6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 7ec3a8b7bSBarry Smith 8d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) 9d71ae5a4SJacob Faibussowitsch { 10b588c5a2SHong Zhang Mat C = B; 11b588c5a2SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 12b588c5a2SHong Zhang IS isrow = b->row, isicol = b->icol; 135a586d82SBarry Smith const PetscInt *r, *ic; 14bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 15bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 16bbd65245SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *pv; 17bbd65245SShri Abhyankar MatScalar *aa = a->a, *v; 18182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 19*ff6a9541SJacob Faibussowitsch const PetscBool allowzeropivot = PetscNot(A->erroriffailure); 20b588c5a2SHong Zhang 21b588c5a2SHong Zhang PetscFunctionBegin; 229566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 239566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 24b588c5a2SHong Zhang 254c000e2eSHong Zhang /* generate work space needed by the factorization */ 269566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 279566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 28b588c5a2SHong Zhang 29*ff6a9541SJacob Faibussowitsch for (PetscInt i = 0; i < n; i++) { 30b588c5a2SHong Zhang /* zero rtmp */ 31b588c5a2SHong Zhang /* L part */ 32*ff6a9541SJacob Faibussowitsch PetscInt *pj; 33*ff6a9541SJacob Faibussowitsch PetscInt nzL, nz = bi[i + 1] - bi[i]; 34b588c5a2SHong Zhang bjtmp = bj + bi[i]; 35*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 36b588c5a2SHong Zhang 37b588c5a2SHong Zhang /* U part */ 380c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 390c4413a7SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 40*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 410c4413a7SShri Abhyankar 420c4413a7SShri Abhyankar /* load in initial (unfactored row) */ 430c4413a7SShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 440c4413a7SShri Abhyankar ajtmp = aj + ai[r[i]]; 450c4413a7SShri Abhyankar v = aa + bs2 * ai[r[i]]; 46*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2)); 470c4413a7SShri Abhyankar 480c4413a7SShri Abhyankar /* elimination */ 490c4413a7SShri Abhyankar bjtmp = bj + bi[i]; 500c4413a7SShri Abhyankar nzL = bi[i + 1] - bi[i]; 51*ff6a9541SJacob Faibussowitsch for (PetscInt k = 0; k < nzL; k++) { 52*ff6a9541SJacob Faibussowitsch PetscBool flg = PETSC_FALSE; 53*ff6a9541SJacob Faibussowitsch PetscInt row = bjtmp[k]; 54*ff6a9541SJacob Faibussowitsch 550c4413a7SShri Abhyankar pc = rtmp + bs2 * row; 56*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < bs2; j++) { 57c35f09e5SBarry Smith if (pc[j] != (PetscScalar)0.0) { 58*ff6a9541SJacob Faibussowitsch flg = PETSC_TRUE; 59c35f09e5SBarry Smith break; 60c35f09e5SBarry Smith } 61c35f09e5SBarry Smith } 620c4413a7SShri Abhyankar if (flg) { 630c4413a7SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 6496b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 659566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 660c4413a7SShri Abhyankar 67a5b23f4aSJose E. Roman pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 680c4413a7SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 690c4413a7SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */ 70*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) { 7196b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 720c4413a7SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 730c4413a7SShri Abhyankar v = rtmp + 4 * pj[j]; 749566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 750c4413a7SShri Abhyankar pv += 4; 760c4413a7SShri Abhyankar } 779566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 780c4413a7SShri Abhyankar } 790c4413a7SShri Abhyankar } 800c4413a7SShri Abhyankar 810c4413a7SShri Abhyankar /* finished row so stick it into b->a */ 820c4413a7SShri Abhyankar /* L part */ 830c4413a7SShri Abhyankar pv = b->a + bs2 * bi[i]; 840c4413a7SShri Abhyankar pj = b->j + bi[i]; 850c4413a7SShri Abhyankar nz = bi[i + 1] - bi[i]; 86*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 870c4413a7SShri Abhyankar 88a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 890c4413a7SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 900c4413a7SShri Abhyankar pj = b->j + bdiag[i]; 919566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 92*ff6a9541SJacob Faibussowitsch { 93*ff6a9541SJacob Faibussowitsch PetscBool zeropivotdetected; 94*ff6a9541SJacob Faibussowitsch 959566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 967b6c816cSBarry Smith if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 97*ff6a9541SJacob Faibussowitsch } 980c4413a7SShri Abhyankar 990c4413a7SShri Abhyankar /* U part */ 1000c4413a7SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 1010c4413a7SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 1020c4413a7SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 103*ff6a9541SJacob Faibussowitsch for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 1040c4413a7SShri Abhyankar } 1050c4413a7SShri Abhyankar 1069566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 1079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 1089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 10926fbe8dcSKarl Rupp 1104dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_2; 1114dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 1120c4413a7SShri Abhyankar C->assembled = PETSC_TRUE; 11326fbe8dcSKarl Rupp 1149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 1153ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1160c4413a7SShri Abhyankar } 1170c4413a7SShri Abhyankar 118d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) 119d71ae5a4SJacob Faibussowitsch { 1204c000e2eSHong Zhang Mat C = B; 1214c000e2eSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 122bbd65245SShri Abhyankar PetscInt i, j, k, nz, nzL, row, *pj; 123bbd65245SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2; 124bbd65245SShri Abhyankar const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag; 125bbd65245SShri Abhyankar MatScalar *rtmp, *pc, *mwork, *pv; 126bbd65245SShri Abhyankar MatScalar *aa = a->a, *v; 127bbd65245SShri Abhyankar PetscInt flg; 128182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 129a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 1304c000e2eSHong Zhang 1314c000e2eSHong Zhang PetscFunctionBegin; 1320164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1330164db54SHong Zhang 1344c000e2eSHong Zhang /* generate work space needed by the factorization */ 1359566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork)); 1369566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, bs2 * n)); 1374c000e2eSHong Zhang 1384c000e2eSHong Zhang for (i = 0; i < n; i++) { 1394c000e2eSHong Zhang /* zero rtmp */ 1404c000e2eSHong Zhang /* L part */ 1414c000e2eSHong Zhang nz = bi[i + 1] - bi[i]; 1424c000e2eSHong Zhang bjtmp = bj + bi[i]; 14348a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 1444c000e2eSHong Zhang 1454c000e2eSHong Zhang /* U part */ 146b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 147b2b2dd24SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 14848a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2)); 149b2b2dd24SShri Abhyankar 150b2b2dd24SShri Abhyankar /* load in initial (unfactored row) */ 151b2b2dd24SShri Abhyankar nz = ai[i + 1] - ai[i]; 152b2b2dd24SShri Abhyankar ajtmp = aj + ai[i]; 153b2b2dd24SShri Abhyankar v = aa + bs2 * ai[i]; 15448a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2)); 155b2b2dd24SShri Abhyankar 156b2b2dd24SShri Abhyankar /* elimination */ 157b2b2dd24SShri Abhyankar bjtmp = bj + bi[i]; 158b2b2dd24SShri Abhyankar nzL = bi[i + 1] - bi[i]; 159b2b2dd24SShri Abhyankar for (k = 0; k < nzL; k++) { 160b2b2dd24SShri Abhyankar row = bjtmp[k]; 161b2b2dd24SShri Abhyankar pc = rtmp + bs2 * row; 162c35f09e5SBarry Smith for (flg = 0, j = 0; j < bs2; j++) { 163c35f09e5SBarry Smith if (pc[j] != (PetscScalar)0.0) { 164c35f09e5SBarry Smith flg = 1; 165c35f09e5SBarry Smith break; 166c35f09e5SBarry Smith } 167c35f09e5SBarry Smith } 168b2b2dd24SShri Abhyankar if (flg) { 169b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[row]; 17096b95a6bSBarry Smith /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */ 1719566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork)); 172b2b2dd24SShri Abhyankar 173b2b2dd24SShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 174b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[row + 1] + 1); 175b2b2dd24SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 176b2b2dd24SShri Abhyankar for (j = 0; j < nz; j++) { 17796b95a6bSBarry Smith /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */ 178b2b2dd24SShri Abhyankar /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */ 179b2b2dd24SShri Abhyankar v = rtmp + 4 * pj[j]; 1809566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv)); 181b2b2dd24SShri Abhyankar pv += 4; 182b2b2dd24SShri Abhyankar } 1839566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 184b2b2dd24SShri Abhyankar } 185b2b2dd24SShri Abhyankar } 186b2b2dd24SShri Abhyankar 187b2b2dd24SShri Abhyankar /* finished row so stick it into b->a */ 188b2b2dd24SShri Abhyankar /* L part */ 189b2b2dd24SShri Abhyankar pv = b->a + bs2 * bi[i]; 190b2b2dd24SShri Abhyankar pj = b->j + bi[i]; 191b2b2dd24SShri Abhyankar nz = bi[i + 1] - bi[i]; 19248a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 193b2b2dd24SShri Abhyankar 194a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 195b2b2dd24SShri Abhyankar pv = b->a + bs2 * bdiag[i]; 196b2b2dd24SShri Abhyankar pj = b->j + bdiag[i]; 1979566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2)); 1989566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected)); 1997b6c816cSBarry Smith if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 200b2b2dd24SShri Abhyankar 201b2b2dd24SShri Abhyankar /* U part */ 202b2b2dd24SShri Abhyankar /* 203b2b2dd24SShri Abhyankar pv = b->a + bs2*bi[2*n-i]; 204b2b2dd24SShri Abhyankar pj = b->j + bi[2*n-i]; 205b2b2dd24SShri Abhyankar nz = bi[2*n-i+1] - bi[2*n-i] - 1; 206b2b2dd24SShri Abhyankar */ 207b2b2dd24SShri Abhyankar pv = b->a + bs2 * (bdiag[i + 1] + 1); 208b2b2dd24SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 209b2b2dd24SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 21048a46eb9SPierre Jolivet for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2)); 211b2b2dd24SShri Abhyankar } 2129566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, mwork)); 213ae3d28f0SHong Zhang 2144dd39f65SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 2159f5c0bcdSBarry Smith C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_2_NaturalOrdering; 2169f5c0bcdSBarry Smith C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering; 2174dd39f65SShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 218b2b2dd24SShri Abhyankar C->assembled = PETSC_TRUE; 21926fbe8dcSKarl Rupp 2209566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */ 2213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 222b2b2dd24SShri Abhyankar } 223b2b2dd24SShri Abhyankar 224d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) 225d71ae5a4SJacob Faibussowitsch { 226719d5645SBarry Smith Mat C = B; 2274eeb42bcSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 2287cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 2295d0c19d7SBarry Smith const PetscInt *r, *ic; 2305d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 231690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row; 232690b6cddSBarry Smith PetscInt *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj; 233329f5518SBarry Smith MatScalar *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4; 2342515f8d2SSatish Balay MatScalar p1, p2, p3, p4; 2353f1db9ecSBarry Smith MatScalar *ba = b->a, *aa = a->a; 236182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 237a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 2384eeb42bcSBarry Smith 2393a40ed3dSBarry Smith PetscFunctionBegin; 2400164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 2419566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 2429566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 2439566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 2444eeb42bcSBarry Smith 2454eeb42bcSBarry Smith for (i = 0; i < n; i++) { 2464078e994SBarry Smith nz = bi[i + 1] - bi[i]; 2474078e994SBarry Smith ajtmp = bj + bi[i]; 2484eeb42bcSBarry Smith for (j = 0; j < nz; j++) { 2499371c9d4SSatish Balay x = rtmp + 4 * ajtmp[j]; 2509371c9d4SSatish Balay x[0] = x[1] = x[2] = x[3] = 0.0; 2514eeb42bcSBarry Smith } 2524eeb42bcSBarry Smith /* load in initial (unfactored row) */ 2534eeb42bcSBarry Smith idx = r[i]; 2544078e994SBarry Smith nz = ai[idx + 1] - ai[idx]; 2554078e994SBarry Smith ajtmpold = aj + ai[idx]; 2564078e994SBarry Smith v = aa + 4 * ai[idx]; 2574eeb42bcSBarry Smith for (j = 0; j < nz; j++) { 2584eeb42bcSBarry Smith x = rtmp + 4 * ic[ajtmpold[j]]; 2599371c9d4SSatish Balay x[0] = v[0]; 2609371c9d4SSatish Balay x[1] = v[1]; 2619371c9d4SSatish Balay x[2] = v[2]; 2629371c9d4SSatish Balay x[3] = v[3]; 2634eeb42bcSBarry Smith v += 4; 2644eeb42bcSBarry Smith } 2654eeb42bcSBarry Smith row = *ajtmp++; 2664eeb42bcSBarry Smith while (row < i) { 2674eeb42bcSBarry Smith pc = rtmp + 4 * row; 2689371c9d4SSatish Balay p1 = pc[0]; 2699371c9d4SSatish Balay p2 = pc[1]; 2709371c9d4SSatish Balay p3 = pc[2]; 2719371c9d4SSatish Balay p4 = pc[3]; 272d4a378daSJed Brown if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 2734078e994SBarry Smith pv = ba + 4 * diag_offset[row]; 2744078e994SBarry Smith pj = bj + diag_offset[row] + 1; 2759371c9d4SSatish Balay x1 = pv[0]; 2769371c9d4SSatish Balay x2 = pv[1]; 2779371c9d4SSatish Balay x3 = pv[2]; 2789371c9d4SSatish Balay x4 = pv[3]; 2794eeb42bcSBarry Smith pc[0] = m1 = p1 * x1 + p3 * x2; 2804eeb42bcSBarry Smith pc[1] = m2 = p2 * x1 + p4 * x2; 2814eeb42bcSBarry Smith pc[2] = m3 = p1 * x3 + p3 * x4; 2824eeb42bcSBarry Smith pc[3] = m4 = p2 * x3 + p4 * x4; 2834078e994SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 2844eeb42bcSBarry Smith pv += 4; 2854eeb42bcSBarry Smith for (j = 0; j < nz; j++) { 2869371c9d4SSatish Balay x1 = pv[0]; 2879371c9d4SSatish Balay x2 = pv[1]; 2889371c9d4SSatish Balay x3 = pv[2]; 2899371c9d4SSatish Balay x4 = pv[3]; 2904eeb42bcSBarry Smith x = rtmp + 4 * pj[j]; 2914eeb42bcSBarry Smith x[0] -= m1 * x1 + m3 * x2; 2924eeb42bcSBarry Smith x[1] -= m2 * x1 + m4 * x2; 2934eeb42bcSBarry Smith x[2] -= m1 * x3 + m3 * x4; 2944eeb42bcSBarry Smith x[3] -= m2 * x3 + m4 * x4; 2954eeb42bcSBarry Smith pv += 4; 2964eeb42bcSBarry Smith } 2979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 2984eeb42bcSBarry Smith } 2994eeb42bcSBarry Smith row = *ajtmp++; 3004eeb42bcSBarry Smith } 3014eeb42bcSBarry Smith /* finished row so stick it into b->a */ 3024078e994SBarry Smith pv = ba + 4 * bi[i]; 3034078e994SBarry Smith pj = bj + bi[i]; 3044078e994SBarry Smith nz = bi[i + 1] - bi[i]; 3054eeb42bcSBarry Smith for (j = 0; j < nz; j++) { 3064eeb42bcSBarry Smith x = rtmp + 4 * pj[j]; 3079371c9d4SSatish Balay pv[0] = x[0]; 3089371c9d4SSatish Balay pv[1] = x[1]; 3099371c9d4SSatish Balay pv[2] = x[2]; 3109371c9d4SSatish Balay pv[3] = x[3]; 3114eeb42bcSBarry Smith pv += 4; 3124eeb42bcSBarry Smith } 3134eeb42bcSBarry Smith /* invert diagonal block */ 3144078e994SBarry Smith w = ba + 4 * diag_offset[i]; 3159566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 3167b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 3174eeb42bcSBarry Smith } 3184eeb42bcSBarry Smith 3199566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 3209566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 3219566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 32226fbe8dcSKarl Rupp 32306e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_2_inplace; 32406e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace; 3254eeb42bcSBarry Smith C->assembled = PETSC_TRUE; 32626fbe8dcSKarl Rupp 3279566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 3283ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 3294eeb42bcSBarry Smith } 3304cd38560SBarry Smith /* 3314cd38560SBarry Smith Version for when blocks are 2 by 2 Using natural ordering 3324cd38560SBarry Smith */ 333d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) 334d71ae5a4SJacob Faibussowitsch { 3354cd38560SBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 336690b6cddSBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 337690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row; 338690b6cddSBarry Smith PetscInt *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj; 339329f5518SBarry Smith MatScalar *pv, *v, *rtmp, *pc, *w, *x; 3402515f8d2SSatish Balay MatScalar p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4; 3414cd38560SBarry Smith MatScalar *ba = b->a, *aa = a->a; 342182b8fbaSHong Zhang PetscReal shift = info->shiftamount; 343a455e926SHong Zhang PetscBool allowzeropivot, zeropivotdetected; 3444cd38560SBarry Smith 3454cd38560SBarry Smith PetscFunctionBegin; 3460164db54SHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 3479566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(4 * (n + 1), &rtmp)); 3484cd38560SBarry Smith for (i = 0; i < n; i++) { 3494cd38560SBarry Smith nz = bi[i + 1] - bi[i]; 3504cd38560SBarry Smith ajtmp = bj + bi[i]; 3514cd38560SBarry Smith for (j = 0; j < nz; j++) { 3524cd38560SBarry Smith x = rtmp + 4 * ajtmp[j]; 3534cd38560SBarry Smith x[0] = x[1] = x[2] = x[3] = 0.0; 3544cd38560SBarry Smith } 3554cd38560SBarry Smith /* load in initial (unfactored row) */ 3564cd38560SBarry Smith nz = ai[i + 1] - ai[i]; 3574cd38560SBarry Smith ajtmpold = aj + ai[i]; 3584cd38560SBarry Smith v = aa + 4 * ai[i]; 3594cd38560SBarry Smith for (j = 0; j < nz; j++) { 3604cd38560SBarry Smith x = rtmp + 4 * ajtmpold[j]; 3619371c9d4SSatish Balay x[0] = v[0]; 3629371c9d4SSatish Balay x[1] = v[1]; 3639371c9d4SSatish Balay x[2] = v[2]; 3649371c9d4SSatish Balay x[3] = v[3]; 3654cd38560SBarry Smith v += 4; 3664cd38560SBarry Smith } 3674cd38560SBarry Smith row = *ajtmp++; 3684cd38560SBarry Smith while (row < i) { 3694cd38560SBarry Smith pc = rtmp + 4 * row; 3709371c9d4SSatish Balay p1 = pc[0]; 3719371c9d4SSatish Balay p2 = pc[1]; 3729371c9d4SSatish Balay p3 = pc[2]; 3739371c9d4SSatish Balay p4 = pc[3]; 374d4a378daSJed Brown if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) { 3754cd38560SBarry Smith pv = ba + 4 * diag_offset[row]; 3764cd38560SBarry Smith pj = bj + diag_offset[row] + 1; 3779371c9d4SSatish Balay x1 = pv[0]; 3789371c9d4SSatish Balay x2 = pv[1]; 3799371c9d4SSatish Balay x3 = pv[2]; 3809371c9d4SSatish Balay x4 = pv[3]; 3814cd38560SBarry Smith pc[0] = m1 = p1 * x1 + p3 * x2; 3824cd38560SBarry Smith pc[1] = m2 = p2 * x1 + p4 * x2; 3834cd38560SBarry Smith pc[2] = m3 = p1 * x3 + p3 * x4; 3844cd38560SBarry Smith pc[3] = m4 = p2 * x3 + p4 * x4; 3854cd38560SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 3864cd38560SBarry Smith pv += 4; 3874cd38560SBarry Smith for (j = 0; j < nz; j++) { 3889371c9d4SSatish Balay x1 = pv[0]; 3899371c9d4SSatish Balay x2 = pv[1]; 3909371c9d4SSatish Balay x3 = pv[2]; 3919371c9d4SSatish Balay x4 = pv[3]; 3924cd38560SBarry Smith x = rtmp + 4 * pj[j]; 3934cd38560SBarry Smith x[0] -= m1 * x1 + m3 * x2; 3944cd38560SBarry Smith x[1] -= m2 * x1 + m4 * x2; 3954cd38560SBarry Smith x[2] -= m1 * x3 + m3 * x4; 3964cd38560SBarry Smith x[3] -= m2 * x3 + m4 * x4; 3974cd38560SBarry Smith pv += 4; 3984cd38560SBarry Smith } 3999566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(16.0 * nz + 12.0)); 4004cd38560SBarry Smith } 4014cd38560SBarry Smith row = *ajtmp++; 4024cd38560SBarry Smith } 4034cd38560SBarry Smith /* finished row so stick it into b->a */ 4044cd38560SBarry Smith pv = ba + 4 * bi[i]; 4054cd38560SBarry Smith pj = bj + bi[i]; 4064cd38560SBarry Smith nz = bi[i + 1] - bi[i]; 4074cd38560SBarry Smith for (j = 0; j < nz; j++) { 4084cd38560SBarry Smith x = rtmp + 4 * pj[j]; 4099371c9d4SSatish Balay pv[0] = x[0]; 4109371c9d4SSatish Balay pv[1] = x[1]; 4119371c9d4SSatish Balay pv[2] = x[2]; 4129371c9d4SSatish Balay pv[3] = x[3]; 4132efa7f71SHong Zhang /* 4142efa7f71SHong Zhang printf(" col %d:",pj[j]); 4152efa7f71SHong Zhang PetscInt j1; 4162efa7f71SHong Zhang for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1)); 4172efa7f71SHong Zhang printf("\n"); 4182efa7f71SHong Zhang */ 4194cd38560SBarry Smith pv += 4; 4204cd38560SBarry Smith } 4214cd38560SBarry Smith /* invert diagonal block */ 4224cd38560SBarry Smith w = ba + 4 * diag_offset[i]; 4239566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected)); 4247b6c816cSBarry Smith if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 4254cd38560SBarry Smith } 4264cd38560SBarry Smith 4279566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 42826fbe8dcSKarl Rupp 42906e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace; 43006e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace; 4314cd38560SBarry Smith C->assembled = PETSC_TRUE; 43226fbe8dcSKarl Rupp 4339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */ 4343ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 4354cd38560SBarry Smith } 4367fc0212eSBarry Smith 4377fc0212eSBarry Smith /* ----------------------------------------------------------- */ 4387fc0212eSBarry Smith /* 4397fc0212eSBarry Smith Version for when blocks are 1 by 1. 4407fc0212eSBarry Smith */ 441d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) 442d71ae5a4SJacob Faibussowitsch { 443048b5e81SShri Abhyankar Mat C = B; 444048b5e81SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 445048b5e81SShri Abhyankar IS isrow = b->row, isicol = b->icol; 446048b5e81SShri Abhyankar const PetscInt *r, *ic, *ics; 447048b5e81SShri Abhyankar const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag; 448048b5e81SShri Abhyankar PetscInt i, j, k, nz, nzL, row, *pj; 449048b5e81SShri Abhyankar const PetscInt *ajtmp, *bjtmp; 450048b5e81SShri Abhyankar MatScalar *rtmp, *pc, multiplier, *pv; 451048b5e81SShri Abhyankar const MatScalar *aa = a->a, *v; 452ace3abfcSBarry Smith PetscBool row_identity, col_identity; 453048b5e81SShri Abhyankar FactorShiftCtx sctx; 454048b5e81SShri Abhyankar const PetscInt *ddiag; 455048b5e81SShri Abhyankar PetscReal rs; 456048b5e81SShri Abhyankar MatScalar d; 457048b5e81SShri Abhyankar 458048b5e81SShri Abhyankar PetscFunctionBegin; 459048b5e81SShri Abhyankar /* MatPivotSetUp(): initialize shift context sctx */ 4609566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 461048b5e81SShri Abhyankar 462048b5e81SShri Abhyankar if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */ 463048b5e81SShri Abhyankar ddiag = a->diag; 464048b5e81SShri Abhyankar sctx.shift_top = info->zeropivot; 465048b5e81SShri Abhyankar for (i = 0; i < n; i++) { 466048b5e81SShri Abhyankar /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */ 467048b5e81SShri Abhyankar d = (aa)[ddiag[i]]; 468048b5e81SShri Abhyankar rs = -PetscAbsScalar(d) - PetscRealPart(d); 469048b5e81SShri Abhyankar v = aa + ai[i]; 470048b5e81SShri Abhyankar nz = ai[i + 1] - ai[i]; 47126fbe8dcSKarl Rupp for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]); 472048b5e81SShri Abhyankar if (rs > sctx.shift_top) sctx.shift_top = rs; 473048b5e81SShri Abhyankar } 474048b5e81SShri Abhyankar sctx.shift_top *= 1.1; 475048b5e81SShri Abhyankar sctx.nshift_max = 5; 476048b5e81SShri Abhyankar sctx.shift_lo = 0.; 477048b5e81SShri Abhyankar sctx.shift_hi = 1.; 478048b5e81SShri Abhyankar } 479048b5e81SShri Abhyankar 4809566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 4819566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 4829566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &rtmp)); 483048b5e81SShri Abhyankar ics = ic; 484048b5e81SShri Abhyankar 485048b5e81SShri Abhyankar do { 486048b5e81SShri Abhyankar sctx.newshift = PETSC_FALSE; 487048b5e81SShri Abhyankar for (i = 0; i < n; i++) { 488048b5e81SShri Abhyankar /* zero rtmp */ 489048b5e81SShri Abhyankar /* L part */ 490048b5e81SShri Abhyankar nz = bi[i + 1] - bi[i]; 491048b5e81SShri Abhyankar bjtmp = bj + bi[i]; 492048b5e81SShri Abhyankar for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 493048b5e81SShri Abhyankar 494048b5e81SShri Abhyankar /* U part */ 495048b5e81SShri Abhyankar nz = bdiag[i] - bdiag[i + 1]; 496048b5e81SShri Abhyankar bjtmp = bj + bdiag[i + 1] + 1; 497048b5e81SShri Abhyankar for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0; 498048b5e81SShri Abhyankar 499048b5e81SShri Abhyankar /* load in initial (unfactored row) */ 500048b5e81SShri Abhyankar nz = ai[r[i] + 1] - ai[r[i]]; 501048b5e81SShri Abhyankar ajtmp = aj + ai[r[i]]; 502048b5e81SShri Abhyankar v = aa + ai[r[i]]; 50326fbe8dcSKarl Rupp for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j]; 50426fbe8dcSKarl Rupp 505048b5e81SShri Abhyankar /* ZeropivotApply() */ 506048b5e81SShri Abhyankar rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */ 507048b5e81SShri Abhyankar 508048b5e81SShri Abhyankar /* elimination */ 509048b5e81SShri Abhyankar bjtmp = bj + bi[i]; 510048b5e81SShri Abhyankar row = *bjtmp++; 511048b5e81SShri Abhyankar nzL = bi[i + 1] - bi[i]; 512048b5e81SShri Abhyankar for (k = 0; k < nzL; k++) { 513048b5e81SShri Abhyankar pc = rtmp + row; 514d4a378daSJed Brown if (*pc != (PetscScalar)0.0) { 515048b5e81SShri Abhyankar pv = b->a + bdiag[row]; 516048b5e81SShri Abhyankar multiplier = *pc * (*pv); 517048b5e81SShri Abhyankar *pc = multiplier; 51826fbe8dcSKarl Rupp 519048b5e81SShri Abhyankar pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */ 520048b5e81SShri Abhyankar pv = b->a + bdiag[row + 1] + 1; 521048b5e81SShri Abhyankar nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */ 522048b5e81SShri Abhyankar for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 5239566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * nz)); 524048b5e81SShri Abhyankar } 525048b5e81SShri Abhyankar row = *bjtmp++; 526048b5e81SShri Abhyankar } 527048b5e81SShri Abhyankar 528048b5e81SShri Abhyankar /* finished row so stick it into b->a */ 529048b5e81SShri Abhyankar rs = 0.0; 530048b5e81SShri Abhyankar /* L part */ 531048b5e81SShri Abhyankar pv = b->a + bi[i]; 532048b5e81SShri Abhyankar pj = b->j + bi[i]; 533048b5e81SShri Abhyankar nz = bi[i + 1] - bi[i]; 534048b5e81SShri Abhyankar for (j = 0; j < nz; j++) { 5359371c9d4SSatish Balay pv[j] = rtmp[pj[j]]; 5369371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 537048b5e81SShri Abhyankar } 538048b5e81SShri Abhyankar 539048b5e81SShri Abhyankar /* U part */ 540048b5e81SShri Abhyankar pv = b->a + bdiag[i + 1] + 1; 541048b5e81SShri Abhyankar pj = b->j + bdiag[i + 1] + 1; 542048b5e81SShri Abhyankar nz = bdiag[i] - bdiag[i + 1] - 1; 543048b5e81SShri Abhyankar for (j = 0; j < nz; j++) { 5449371c9d4SSatish Balay pv[j] = rtmp[pj[j]]; 5459371c9d4SSatish Balay rs += PetscAbsScalar(pv[j]); 546048b5e81SShri Abhyankar } 547048b5e81SShri Abhyankar 548048b5e81SShri Abhyankar sctx.rs = rs; 549048b5e81SShri Abhyankar sctx.pv = rtmp[i]; 5509566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(B, A, info, &sctx, i)); 551048b5e81SShri Abhyankar if (sctx.newshift) break; /* break for-loop */ 552048b5e81SShri Abhyankar rtmp[i] = sctx.pv; /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */ 553048b5e81SShri Abhyankar 554a5b23f4aSJose E. Roman /* Mark diagonal and invert diagonal for simpler triangular solves */ 555048b5e81SShri Abhyankar pv = b->a + bdiag[i]; 556d4a378daSJed Brown *pv = (PetscScalar)1.0 / rtmp[i]; 557048b5e81SShri Abhyankar 558048b5e81SShri Abhyankar } /* endof for (i=0; i<n; i++) { */ 559048b5e81SShri Abhyankar 560048b5e81SShri Abhyankar /* MatPivotRefine() */ 561048b5e81SShri Abhyankar if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) { 562048b5e81SShri Abhyankar /* 563048b5e81SShri Abhyankar * if no shift in this attempt & shifting & started shifting & can refine, 564048b5e81SShri Abhyankar * then try lower shift 565048b5e81SShri Abhyankar */ 566048b5e81SShri Abhyankar sctx.shift_hi = sctx.shift_fraction; 567048b5e81SShri Abhyankar sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.; 568048b5e81SShri Abhyankar sctx.shift_amount = sctx.shift_fraction * sctx.shift_top; 569048b5e81SShri Abhyankar sctx.newshift = PETSC_TRUE; 570048b5e81SShri Abhyankar sctx.nshift++; 571048b5e81SShri Abhyankar } 572048b5e81SShri Abhyankar } while (sctx.newshift); 573048b5e81SShri Abhyankar 5749566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 5759566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 5769566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 577048b5e81SShri Abhyankar 5789566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 5799566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &col_identity)); 580048b5e81SShri Abhyankar if (row_identity && col_identity) { 581048b5e81SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 5829f5c0bcdSBarry Smith C->ops->forwardsolve = MatForwardSolve_SeqBAIJ_1_NaturalOrdering; 5839f5c0bcdSBarry Smith C->ops->backwardsolve = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering; 58493fd935bSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 585048b5e81SShri Abhyankar } else { 586048b5e81SShri Abhyankar C->ops->solve = MatSolve_SeqBAIJ_1; 58793fd935bSShri Abhyankar C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 588048b5e81SShri Abhyankar } 589048b5e81SShri Abhyankar C->assembled = PETSC_TRUE; 5909566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 591048b5e81SShri Abhyankar 592048b5e81SShri Abhyankar /* MatShiftView(A,info,&sctx) */ 593048b5e81SShri Abhyankar if (sctx.nshift) { 594048b5e81SShri Abhyankar if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 5959566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top)); 596048b5e81SShri Abhyankar } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 5979566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 598048b5e81SShri Abhyankar } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) { 5999566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount)); 600048b5e81SShri Abhyankar } 601048b5e81SShri Abhyankar } 6023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 603048b5e81SShri Abhyankar } 604048b5e81SShri Abhyankar 605d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) 606d71ae5a4SJacob Faibussowitsch { 6077fc0212eSBarry Smith Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data; 6087cdf2dbbSSatish Balay IS isrow = b->row, isicol = b->icol; 6095d0c19d7SBarry Smith const PetscInt *r, *ic; 6105d0c19d7SBarry Smith PetscInt i, j, n = a->mbs, *bi = b->i, *bj = b->j; 611690b6cddSBarry Smith PetscInt *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j; 612690b6cddSBarry Smith PetscInt *diag_offset = b->diag, diag, *pj; 613329f5518SBarry Smith MatScalar *pv, *v, *rtmp, multiplier, *pc; 6143f1db9ecSBarry Smith MatScalar *ba = b->a, *aa = a->a; 615ace3abfcSBarry Smith PetscBool row_identity, col_identity; 6167fc0212eSBarry Smith 6173a40ed3dSBarry Smith PetscFunctionBegin; 6189566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 6199566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 6209566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(n + 1, &rtmp)); 6217fc0212eSBarry Smith 6227fc0212eSBarry Smith for (i = 0; i < n; i++) { 6234078e994SBarry Smith nz = bi[i + 1] - bi[i]; 6244078e994SBarry Smith ajtmp = bj + bi[i]; 6257fc0212eSBarry Smith for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0; 6267fc0212eSBarry Smith 6277fc0212eSBarry Smith /* load in initial (unfactored row) */ 6284078e994SBarry Smith nz = ai[r[i] + 1] - ai[r[i]]; 6294078e994SBarry Smith ajtmpold = aj + ai[r[i]]; 6304078e994SBarry Smith v = aa + ai[r[i]]; 6317fc0212eSBarry Smith for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j]; 6327fc0212eSBarry Smith 6337fc0212eSBarry Smith row = *ajtmp++; 6347fc0212eSBarry Smith while (row < i) { 6357fc0212eSBarry Smith pc = rtmp + row; 6367fc0212eSBarry Smith if (*pc != 0.0) { 6374078e994SBarry Smith pv = ba + diag_offset[row]; 6384078e994SBarry Smith pj = bj + diag_offset[row] + 1; 6397fc0212eSBarry Smith multiplier = *pc * *pv++; 6407fc0212eSBarry Smith *pc = multiplier; 6414078e994SBarry Smith nz = bi[row + 1] - diag_offset[row] - 1; 6427fc0212eSBarry Smith for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j]; 6439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(1.0 + 2.0 * nz)); 6447fc0212eSBarry Smith } 6457fc0212eSBarry Smith row = *ajtmp++; 6467fc0212eSBarry Smith } 6477fc0212eSBarry Smith /* finished row so stick it into b->a */ 6484078e994SBarry Smith pv = ba + bi[i]; 6494078e994SBarry Smith pj = bj + bi[i]; 6504078e994SBarry Smith nz = bi[i + 1] - bi[i]; 65126fbe8dcSKarl Rupp for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]]; 6524078e994SBarry Smith diag = diag_offset[i] - bi[i]; 6537fc0212eSBarry Smith /* check pivot entry for current row */ 65408401ef6SPierre Jolivet PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 6557fc0212eSBarry Smith pv[diag] = 1.0 / pv[diag]; 6567fc0212eSBarry Smith } 6577fc0212eSBarry Smith 6589566063dSJacob Faibussowitsch PetscCall(PetscFree(rtmp)); 6599566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 6609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 6619566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 6629566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &col_identity)); 663f58c8c74SBarry Smith if (row_identity && col_identity) { 66406e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace; 66506e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace; 666f58c8c74SBarry Smith } else { 66706e38f1dSHong Zhang C->ops->solve = MatSolve_SeqBAIJ_1_inplace; 66806e38f1dSHong Zhang C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace; 669f58c8c74SBarry Smith } 6707fc0212eSBarry Smith C->assembled = PETSC_TRUE; 6719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->cmap->n)); 6723ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6737fc0212eSBarry Smith } 6747fc0212eSBarry Smith 675d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) 676d71ae5a4SJacob Faibussowitsch { 6774ac6704cSBarry Smith PetscFunctionBegin; 6784ac6704cSBarry Smith *type = MATSOLVERPETSC; 6793ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6804ac6704cSBarry Smith } 6814ac6704cSBarry Smith 682d71ae5a4SJacob Faibussowitsch PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) 683d71ae5a4SJacob Faibussowitsch { 684d0f46423SBarry Smith PetscInt n = A->rmap->n; 685b24902e0SBarry Smith 686b24902e0SBarry Smith PetscFunctionBegin; 687b499a2aeSHong Zhang #if defined(PETSC_USE_COMPLEX) 688b94d7dedSBarry Smith PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported"); 689b499a2aeSHong Zhang #endif 6909566063dSJacob Faibussowitsch PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B)); 6919566063dSJacob Faibussowitsch PetscCall(MatSetSizes(*B, n, n, n, n)); 692c8342467SHong Zhang if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) { 6939566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQBAIJ)); 69426fbe8dcSKarl Rupp 6954dd39f65SShri Abhyankar (*B)->ops->lufactorsymbolic = MatLUFactorSymbolic_SeqBAIJ; 6964dd39f65SShri Abhyankar (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ; 6979566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU])); 6989566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU])); 6999566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT])); 700b24902e0SBarry Smith } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) { 7019566063dSJacob Faibussowitsch PetscCall(MatSetType(*B, MATSEQSBAIJ)); 7029566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL)); 70326fbe8dcSKarl Rupp 7045c9eb25fSBarry Smith (*B)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqBAIJ; 7055c9eb25fSBarry Smith (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ; 7064ac6704cSBarry Smith /* Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */ 7079566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY])); 7089566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC])); 709e32f2f54SBarry Smith } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported"); 710d5f3da31SBarry Smith (*B)->factortype = ftype; 711f73b0415SBarry Smith (*B)->canuseordering = PETSC_TRUE; 71200c67f3bSHong Zhang 7139566063dSJacob Faibussowitsch PetscCall(PetscFree((*B)->solvertype)); 7149566063dSJacob Faibussowitsch PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype)); 7159566063dSJacob Faibussowitsch PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc)); 7163ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 717b24902e0SBarry Smith } 718273d9f13SBarry Smith 7195d517e7eSBarry Smith /* ----------------------------------------------------------- */ 720d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) 721d71ae5a4SJacob Faibussowitsch { 7225d517e7eSBarry Smith Mat C; 7235d517e7eSBarry Smith 7243a40ed3dSBarry Smith PetscFunctionBegin; 7259566063dSJacob Faibussowitsch PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C)); 7269566063dSJacob Faibussowitsch PetscCall(MatLUFactorSymbolic(C, A, row, col, info)); 7279566063dSJacob Faibussowitsch PetscCall(MatLUFactorNumeric(C, A, info)); 72826fbe8dcSKarl Rupp 729db4efbfdSBarry Smith A->ops->solve = C->ops->solve; 730db4efbfdSBarry Smith A->ops->solvetranspose = C->ops->solvetranspose; 73126fbe8dcSKarl Rupp 7329566063dSJacob Faibussowitsch PetscCall(MatHeaderMerge(A, &C)); 7333ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7345d517e7eSBarry Smith } 7354cd38560SBarry Smith 736c6db04a5SJed Brown #include <../src/mat/impls/sbaij/seq/sbaij.h> 737d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) 738d71ae5a4SJacob Faibussowitsch { 73978910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 74078910aadSHong Zhang Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 74178910aadSHong Zhang IS ip = b->row; 7425d0c19d7SBarry Smith const PetscInt *rip; 7435d0c19d7SBarry Smith PetscInt i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol; 74478910aadSHong Zhang PetscInt *ai = a->i, *aj = a->j; 74578910aadSHong Zhang PetscInt k, jmin, jmax, *jl, *il, col, nexti, ili, nz; 74678910aadSHong Zhang MatScalar *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi; 74707b50cabSHong Zhang PetscReal rs; 7480e95ead3SHong Zhang FactorShiftCtx sctx; 74978910aadSHong Zhang 750c05c3958SHong Zhang PetscFunctionBegin; 75107b50cabSHong Zhang if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */ 75248a46eb9SPierre Jolivet if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 7539566063dSJacob Faibussowitsch PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info)); 7549566063dSJacob Faibussowitsch PetscCall(MatDestroy(&a->sbaijMat)); 7553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 7566ad2eaddSHong Zhang } 75778910aadSHong Zhang 75807b50cabSHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 7599566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 7603cea4cbeSHong Zhang 7619566063dSJacob Faibussowitsch PetscCall(ISGetIndices(ip, &rip)); 7629566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl)); 76378910aadSHong Zhang 76475567043SBarry Smith sctx.shift_amount = 0.; 7653cea4cbeSHong Zhang sctx.nshift = 0; 76678910aadSHong Zhang do { 76707b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 76878910aadSHong Zhang for (i = 0; i < mbs; i++) { 7699371c9d4SSatish Balay rtmp[i] = 0.0; 7709371c9d4SSatish Balay jl[i] = mbs; 7719371c9d4SSatish Balay il[0] = 0; 77278910aadSHong Zhang } 77378910aadSHong Zhang 77478910aadSHong Zhang for (k = 0; k < mbs; k++) { 77578910aadSHong Zhang bval = ba + bi[k]; 77678910aadSHong Zhang /* initialize k-th row by the perm[k]-th row of A */ 7779371c9d4SSatish Balay jmin = ai[rip[k]]; 7789371c9d4SSatish Balay jmax = ai[rip[k] + 1]; 77978910aadSHong Zhang for (j = jmin; j < jmax; j++) { 78078910aadSHong Zhang col = rip[aj[j]]; 78178910aadSHong Zhang if (col >= k) { /* only take upper triangular entry */ 78278910aadSHong Zhang rtmp[col] = aa[j]; 78378910aadSHong Zhang *bval++ = 0.0; /* for in-place factorization */ 78478910aadSHong Zhang } 78578910aadSHong Zhang } 7863cea4cbeSHong Zhang 7873cea4cbeSHong Zhang /* shift the diagonal of the matrix */ 7883cea4cbeSHong Zhang if (sctx.nshift) rtmp[k] += sctx.shift_amount; 78978910aadSHong Zhang 79078910aadSHong Zhang /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 79178910aadSHong Zhang dk = rtmp[k]; 79278910aadSHong Zhang i = jl[k]; /* first row to be added to k_th row */ 79378910aadSHong Zhang 79478910aadSHong Zhang while (i < k) { 79578910aadSHong Zhang nexti = jl[i]; /* next row to be added to k_th row */ 79678910aadSHong Zhang 79778910aadSHong Zhang /* compute multiplier, update diag(k) and U(i,k) */ 79878910aadSHong Zhang ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 79978910aadSHong Zhang uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */ 80078910aadSHong Zhang dk += uikdi * ba[ili]; 80178910aadSHong Zhang ba[ili] = uikdi; /* -U(i,k) */ 80278910aadSHong Zhang 80378910aadSHong Zhang /* add multiple of row i to k-th row */ 8049371c9d4SSatish Balay jmin = ili + 1; 8059371c9d4SSatish Balay jmax = bi[i + 1]; 80678910aadSHong Zhang if (jmin < jmax) { 80778910aadSHong Zhang for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j]; 80878910aadSHong Zhang /* update il and jl for row i */ 80978910aadSHong Zhang il[i] = jmin; 8109371c9d4SSatish Balay j = bj[jmin]; 8119371c9d4SSatish Balay jl[i] = jl[j]; 8129371c9d4SSatish Balay jl[j] = i; 81378910aadSHong Zhang } 81478910aadSHong Zhang i = nexti; 81578910aadSHong Zhang } 81678910aadSHong Zhang 8173cea4cbeSHong Zhang /* shift the diagonals when zero pivot is detected */ 8183cea4cbeSHong Zhang /* compute rs=sum of abs(off-diagonal) */ 8193cea4cbeSHong Zhang rs = 0.0; 82078910aadSHong Zhang jmin = bi[k] + 1; 82178910aadSHong Zhang nz = bi[k + 1] - jmin; 82278910aadSHong Zhang if (nz) { 82378910aadSHong Zhang bcol = bj + jmin; 82478910aadSHong Zhang while (nz--) { 82589f845c8SHong Zhang rs += PetscAbsScalar(rtmp[*bcol]); 82689f845c8SHong Zhang bcol++; 82778910aadSHong Zhang } 82878910aadSHong Zhang } 8293cea4cbeSHong Zhang 8303cea4cbeSHong Zhang sctx.rs = rs; 8313cea4cbeSHong Zhang sctx.pv = dk; 8329566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 83307b50cabSHong Zhang if (sctx.newshift) break; 8340e95ead3SHong Zhang dk = sctx.pv; 83578910aadSHong Zhang 83678910aadSHong Zhang /* copy data into U(k,:) */ 83778910aadSHong Zhang ba[bi[k]] = 1.0 / dk; /* U(k,k) */ 8389371c9d4SSatish Balay jmin = bi[k] + 1; 8399371c9d4SSatish Balay jmax = bi[k + 1]; 84078910aadSHong Zhang if (jmin < jmax) { 84178910aadSHong Zhang for (j = jmin; j < jmax; j++) { 8429371c9d4SSatish Balay col = bj[j]; 8439371c9d4SSatish Balay ba[j] = rtmp[col]; 8449371c9d4SSatish Balay rtmp[col] = 0.0; 84578910aadSHong Zhang } 84678910aadSHong Zhang /* add the k-th row into il and jl */ 84778910aadSHong Zhang il[k] = jmin; 8489371c9d4SSatish Balay i = bj[jmin]; 8499371c9d4SSatish Balay jl[k] = jl[i]; 8509371c9d4SSatish Balay jl[i] = k; 85178910aadSHong Zhang } 85278910aadSHong Zhang } 85307b50cabSHong Zhang } while (sctx.newshift); 8549566063dSJacob Faibussowitsch PetscCall(PetscFree3(rtmp, il, jl)); 85578910aadSHong Zhang 8569566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(ip, &rip)); 85726fbe8dcSKarl Rupp 85878910aadSHong Zhang C->assembled = PETSC_TRUE; 85978910aadSHong Zhang C->preallocated = PETSC_TRUE; 86026fbe8dcSKarl Rupp 8619566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->rmap->N)); 8623cea4cbeSHong Zhang if (sctx.nshift) { 863f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 8649566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 865f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 8669566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 86778910aadSHong Zhang } 86878910aadSHong Zhang } 8693ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 870c05c3958SHong Zhang } 8714cd38560SBarry Smith 872d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) 873d71ae5a4SJacob Faibussowitsch { 87478910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 87578910aadSHong Zhang Mat_SeqSBAIJ *b = (Mat_SeqSBAIJ *)C->data; 8763cea4cbeSHong Zhang PetscInt i, j, am = a->mbs; 87778910aadSHong Zhang PetscInt *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j; 8783cea4cbeSHong Zhang PetscInt k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz; 87978910aadSHong Zhang MatScalar *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval; 8800e95ead3SHong Zhang PetscReal rs; 8810e95ead3SHong Zhang FactorShiftCtx sctx; 88278910aadSHong Zhang 883c05c3958SHong Zhang PetscFunctionBegin; 8840e95ead3SHong Zhang /* MatPivotSetUp(): initialize shift context sctx */ 8859566063dSJacob Faibussowitsch PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx))); 8863cea4cbeSHong Zhang 8879566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl)); 88878910aadSHong Zhang 88978910aadSHong Zhang do { 89007b50cabSHong Zhang sctx.newshift = PETSC_FALSE; 89178910aadSHong Zhang for (i = 0; i < am; i++) { 8929371c9d4SSatish Balay rtmp[i] = 0.0; 8939371c9d4SSatish Balay jl[i] = am; 8949371c9d4SSatish Balay il[0] = 0; 89578910aadSHong Zhang } 89678910aadSHong Zhang 89778910aadSHong Zhang for (k = 0; k < am; k++) { 89878910aadSHong Zhang /* initialize k-th row with elements nonzero in row perm(k) of A */ 89978910aadSHong Zhang nz = ai[k + 1] - ai[k]; 90078910aadSHong Zhang acol = aj + ai[k]; 90178910aadSHong Zhang aval = aa + ai[k]; 90278910aadSHong Zhang bval = ba + bi[k]; 90378910aadSHong Zhang while (nz--) { 90478910aadSHong Zhang if (*acol < k) { /* skip lower triangular entries */ 9059371c9d4SSatish Balay acol++; 9069371c9d4SSatish Balay aval++; 90778910aadSHong Zhang } else { 90878910aadSHong Zhang rtmp[*acol++] = *aval++; 90978910aadSHong Zhang *bval++ = 0.0; /* for in-place factorization */ 91078910aadSHong Zhang } 91178910aadSHong Zhang } 9123cea4cbeSHong Zhang 9133cea4cbeSHong Zhang /* shift the diagonal of the matrix */ 9143cea4cbeSHong Zhang if (sctx.nshift) rtmp[k] += sctx.shift_amount; 91578910aadSHong Zhang 91678910aadSHong Zhang /* modify k-th row by adding in those rows i with U(i,k)!=0 */ 91778910aadSHong Zhang dk = rtmp[k]; 91878910aadSHong Zhang i = jl[k]; /* first row to be added to k_th row */ 91978910aadSHong Zhang 92078910aadSHong Zhang while (i < k) { 92178910aadSHong Zhang nexti = jl[i]; /* next row to be added to k_th row */ 92278910aadSHong Zhang /* compute multiplier, update D(k) and U(i,k) */ 92378910aadSHong Zhang ili = il[i]; /* index of first nonzero element in U(i,k:bms-1) */ 92478910aadSHong Zhang uikdi = -ba[ili] * ba[bi[i]]; 92578910aadSHong Zhang dk += uikdi * ba[ili]; 92678910aadSHong Zhang ba[ili] = uikdi; /* -U(i,k) */ 92778910aadSHong Zhang 92878910aadSHong Zhang /* add multiple of row i to k-th row ... */ 92978910aadSHong Zhang jmin = ili + 1; 93078910aadSHong Zhang nz = bi[i + 1] - jmin; 93178910aadSHong Zhang if (nz > 0) { 93278910aadSHong Zhang bcol = bj + jmin; 93378910aadSHong Zhang bval = ba + jmin; 93478910aadSHong Zhang while (nz--) rtmp[*bcol++] += uikdi * (*bval++); 93578910aadSHong Zhang /* update il and jl for i-th row */ 93678910aadSHong Zhang il[i] = jmin; 9379371c9d4SSatish Balay j = bj[jmin]; 9389371c9d4SSatish Balay jl[i] = jl[j]; 9399371c9d4SSatish Balay jl[j] = i; 94078910aadSHong Zhang } 94178910aadSHong Zhang i = nexti; 94278910aadSHong Zhang } 94378910aadSHong Zhang 9443cea4cbeSHong Zhang /* shift the diagonals when zero pivot is detected */ 9453cea4cbeSHong Zhang /* compute rs=sum of abs(off-diagonal) */ 9463cea4cbeSHong Zhang rs = 0.0; 94778910aadSHong Zhang jmin = bi[k] + 1; 94878910aadSHong Zhang nz = bi[k + 1] - jmin; 94978910aadSHong Zhang if (nz) { 95078910aadSHong Zhang bcol = bj + jmin; 95178910aadSHong Zhang while (nz--) { 95289f845c8SHong Zhang rs += PetscAbsScalar(rtmp[*bcol]); 95389f845c8SHong Zhang bcol++; 95478910aadSHong Zhang } 95578910aadSHong Zhang } 9563cea4cbeSHong Zhang 9573cea4cbeSHong Zhang sctx.rs = rs; 9583cea4cbeSHong Zhang sctx.pv = dk; 9599566063dSJacob Faibussowitsch PetscCall(MatPivotCheck(C, A, info, &sctx, k)); 96007b50cabSHong Zhang if (sctx.newshift) break; /* sctx.shift_amount is updated */ 9610e95ead3SHong Zhang dk = sctx.pv; 96278910aadSHong Zhang 96378910aadSHong Zhang /* copy data into U(k,:) */ 96478910aadSHong Zhang ba[bi[k]] = 1.0 / dk; 96578910aadSHong Zhang jmin = bi[k] + 1; 96678910aadSHong Zhang nz = bi[k + 1] - jmin; 96778910aadSHong Zhang if (nz) { 96878910aadSHong Zhang bcol = bj + jmin; 96978910aadSHong Zhang bval = ba + jmin; 97078910aadSHong Zhang while (nz--) { 97178910aadSHong Zhang *bval++ = rtmp[*bcol]; 97278910aadSHong Zhang rtmp[*bcol++] = 0.0; 97378910aadSHong Zhang } 97478910aadSHong Zhang /* add k-th row into il and jl */ 97578910aadSHong Zhang il[k] = jmin; 9769371c9d4SSatish Balay i = bj[jmin]; 9779371c9d4SSatish Balay jl[k] = jl[i]; 9789371c9d4SSatish Balay jl[i] = k; 97978910aadSHong Zhang } 98078910aadSHong Zhang } 98107b50cabSHong Zhang } while (sctx.newshift); 9829566063dSJacob Faibussowitsch PetscCall(PetscFree3(rtmp, il, jl)); 98378910aadSHong Zhang 9840a3c351aSHong Zhang C->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 9850a3c351aSHong Zhang C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 98678910aadSHong Zhang C->assembled = PETSC_TRUE; 98778910aadSHong Zhang C->preallocated = PETSC_TRUE; 98826fbe8dcSKarl Rupp 9899566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(C->rmap->N)); 9903cea4cbeSHong Zhang if (sctx.nshift) { 991f4db908eSBarry Smith if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) { 9929566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 993f4db908eSBarry Smith } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { 9949566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount)); 99578910aadSHong Zhang } 99678910aadSHong Zhang } 9973ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 998c05c3958SHong Zhang } 999c05c3958SHong Zhang 1000c6db04a5SJed Brown #include <petscbt.h> 1001c6db04a5SJed Brown #include <../src/mat/utils/freespace.h> 1002d71ae5a4SJacob Faibussowitsch PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1003d71ae5a4SJacob Faibussowitsch { 100478910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 100578910aadSHong Zhang Mat_SeqSBAIJ *b; 100678910aadSHong Zhang Mat B; 100748dd3d27SHong Zhang PetscBool perm_identity, missing; 10085d0c19d7SBarry Smith PetscInt reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui; 10095d0c19d7SBarry Smith const PetscInt *rip; 101078910aadSHong Zhang PetscInt jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow; 10110298fd71SBarry Smith PetscInt nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr; 101278910aadSHong Zhang PetscReal fill = info->fill, levels = info->levels; 10130298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 10140298fd71SBarry Smith PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL; 101578910aadSHong Zhang PetscBT lnkbt; 101678910aadSHong Zhang 1017c05c3958SHong Zhang PetscFunctionBegin; 10189566063dSJacob Faibussowitsch PetscCall(MatMissingDiagonal(A, &missing, &i)); 101928b400f6SJacob Faibussowitsch PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 102048dd3d27SHong Zhang 10216ad2eaddSHong Zhang if (bs > 1) { 102248a46eb9SPierre Jolivet if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1023719d5645SBarry Smith (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 102426fbe8dcSKarl Rupp 10259566063dSJacob Faibussowitsch PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info)); 10263ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 10276ad2eaddSHong Zhang } 10286ad2eaddSHong Zhang 10299566063dSJacob Faibussowitsch PetscCall(ISIdentity(perm, &perm_identity)); 10309566063dSJacob Faibussowitsch PetscCall(ISGetIndices(perm, &rip)); 103178910aadSHong Zhang 103278910aadSHong Zhang /* special case that simply copies fill pattern */ 103378910aadSHong Zhang if (!levels && perm_identity) { 10349566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ui)); 103526fbe8dcSKarl Rupp for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */ 1036719d5645SBarry Smith B = fact; 10379566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui)); 103878910aadSHong Zhang 103978910aadSHong Zhang b = (Mat_SeqSBAIJ *)B->data; 104078910aadSHong Zhang uj = b->j; 104178910aadSHong Zhang for (i = 0; i < am; i++) { 104278910aadSHong Zhang aj = a->j + a->diag[i]; 104326fbe8dcSKarl Rupp for (j = 0; j < ui[i]; j++) *uj++ = *aj++; 104478910aadSHong Zhang b->ilen[i] = ui[i]; 104578910aadSHong Zhang } 10469566063dSJacob Faibussowitsch PetscCall(PetscFree(ui)); 104726fbe8dcSKarl Rupp 1048d5f3da31SBarry Smith B->factortype = MAT_FACTOR_NONE; 104926fbe8dcSKarl Rupp 10509566063dSJacob Faibussowitsch PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY)); 10519566063dSJacob Faibussowitsch PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY)); 1052d5f3da31SBarry Smith B->factortype = MAT_FACTOR_ICC; 105378910aadSHong Zhang 105478910aadSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 10553ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 105678910aadSHong Zhang } 105778910aadSHong Zhang 105878910aadSHong Zhang /* initialization */ 10599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &ui)); 1060e60cf9a0SBarry Smith ui[0] = 0; 10619566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl)); 106278910aadSHong Zhang 106378910aadSHong Zhang /* jl: linked list for storing indices of the pivot rows 106478910aadSHong Zhang il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */ 10659566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl)); 106678910aadSHong Zhang for (i = 0; i < am; i++) { 10679371c9d4SSatish Balay jl[i] = am; 10689371c9d4SSatish Balay il[i] = 0; 106978910aadSHong Zhang } 107078910aadSHong Zhang 107178910aadSHong Zhang /* create and initialize a linked list for storing column indices of the active row k */ 107278910aadSHong Zhang nlnk = am + 1; 10739566063dSJacob Faibussowitsch PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt)); 107478910aadSHong Zhang 107595b5ac22SHong Zhang /* initial FreeSpace size is fill*(ai[am]+am)/2 */ 10769566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space)); 107726fbe8dcSKarl Rupp 107878910aadSHong Zhang current_space = free_space; 107926fbe8dcSKarl Rupp 10809566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl)); 108178910aadSHong Zhang current_space_lvl = free_space_lvl; 108278910aadSHong Zhang 108378910aadSHong Zhang for (k = 0; k < am; k++) { /* for each active row k */ 108478910aadSHong Zhang /* initialize lnk by the column indices of row rip[k] of A */ 108578910aadSHong Zhang nzk = 0; 108678910aadSHong Zhang ncols = ai[rip[k] + 1] - ai[rip[k]]; 108778910aadSHong Zhang ncols_upper = 0; 108878910aadSHong Zhang cols = cols_lvl + am; 108978910aadSHong Zhang for (j = 0; j < ncols; j++) { 109078910aadSHong Zhang i = rip[*(aj + ai[rip[k]] + j)]; 109178910aadSHong Zhang if (i >= k) { /* only take upper triangular entry */ 109278910aadSHong Zhang cols[ncols_upper] = i; 109378910aadSHong Zhang cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */ 109478910aadSHong Zhang ncols_upper++; 109578910aadSHong Zhang } 109678910aadSHong Zhang } 10979566063dSJacob Faibussowitsch PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 109878910aadSHong Zhang nzk += nlnk; 109978910aadSHong Zhang 110078910aadSHong Zhang /* update lnk by computing fill-in for each pivot row to be merged in */ 110178910aadSHong Zhang prow = jl[k]; /* 1st pivot row */ 110278910aadSHong Zhang 110378910aadSHong Zhang while (prow < k) { 110478910aadSHong Zhang nextprow = jl[prow]; 110578910aadSHong Zhang 110678910aadSHong Zhang /* merge prow into k-th row */ 110778910aadSHong Zhang jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */ 110878910aadSHong Zhang jmax = ui[prow + 1]; 110978910aadSHong Zhang ncols = jmax - jmin; 111078910aadSHong Zhang i = jmin - ui[prow]; 111178910aadSHong Zhang cols = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */ 111278910aadSHong Zhang for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j); 11139566063dSJacob Faibussowitsch PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt)); 111478910aadSHong Zhang nzk += nlnk; 111578910aadSHong Zhang 111678910aadSHong Zhang /* update il and jl for prow */ 111778910aadSHong Zhang if (jmin < jmax) { 111878910aadSHong Zhang il[prow] = jmin; 111926fbe8dcSKarl Rupp 11209371c9d4SSatish Balay j = *cols; 11219371c9d4SSatish Balay jl[prow] = jl[j]; 11229371c9d4SSatish Balay jl[j] = prow; 112378910aadSHong Zhang } 112478910aadSHong Zhang prow = nextprow; 112578910aadSHong Zhang } 112678910aadSHong Zhang 112778910aadSHong Zhang /* if free space is not available, make more free space */ 112878910aadSHong Zhang if (current_space->local_remaining < nzk) { 112978910aadSHong Zhang i = am - k + 1; /* num of unfactored rows */ 1130f91af8c7SBarry Smith i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 11319566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 11329566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(i, ¤t_space_lvl)); 113378910aadSHong Zhang reallocs++; 113478910aadSHong Zhang } 113578910aadSHong Zhang 113678910aadSHong Zhang /* copy data into free_space and free_space_lvl, then initialize lnk */ 11379566063dSJacob Faibussowitsch PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt)); 113878910aadSHong Zhang 113978910aadSHong Zhang /* add the k-th row into il and jl */ 114078910aadSHong Zhang if (nzk - 1 > 0) { 114178910aadSHong Zhang i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */ 11429371c9d4SSatish Balay jl[k] = jl[i]; 11439371c9d4SSatish Balay jl[i] = k; 114478910aadSHong Zhang il[k] = ui[k] + 1; 114578910aadSHong Zhang } 114678910aadSHong Zhang uj_ptr[k] = current_space->array; 114778910aadSHong Zhang uj_lvl_ptr[k] = current_space_lvl->array; 114878910aadSHong Zhang 114978910aadSHong Zhang current_space->array += nzk; 115078910aadSHong Zhang current_space->local_used += nzk; 115178910aadSHong Zhang current_space->local_remaining -= nzk; 115278910aadSHong Zhang 115378910aadSHong Zhang current_space_lvl->array += nzk; 115478910aadSHong Zhang current_space_lvl->local_used += nzk; 115578910aadSHong Zhang current_space_lvl->local_remaining -= nzk; 115678910aadSHong Zhang 115778910aadSHong Zhang ui[k + 1] = ui[k] + nzk; 115878910aadSHong Zhang } 115978910aadSHong Zhang 11609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(perm, &rip)); 11619566063dSJacob Faibussowitsch PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl)); 11629566063dSJacob Faibussowitsch PetscCall(PetscFree(cols_lvl)); 116378910aadSHong Zhang 11649263d837SHong Zhang /* copy free_space into uj and free free_space; set uj in new datastructure; */ 11659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ui[am] + 1, &uj)); 11669566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 11679566063dSJacob Faibussowitsch PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt)); 11689566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceDestroy(free_space_lvl)); 116978910aadSHong Zhang 117078910aadSHong Zhang /* put together the new matrix in MATSEQSBAIJ format */ 1171719d5645SBarry Smith B = fact; 11729566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL)); 117378910aadSHong Zhang 117478910aadSHong Zhang b = (Mat_SeqSBAIJ *)B->data; 117578910aadSHong Zhang b->singlemalloc = PETSC_FALSE; 1176e6b907acSBarry Smith b->free_a = PETSC_TRUE; 1177e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 117826fbe8dcSKarl Rupp 11799566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ui[am] + 1, &b->a)); 118026fbe8dcSKarl Rupp 118178910aadSHong Zhang b->j = uj; 118278910aadSHong Zhang b->i = ui; 1183f4259b30SLisandro Dalcin b->diag = NULL; 1184f4259b30SLisandro Dalcin b->ilen = NULL; 1185f4259b30SLisandro Dalcin b->imax = NULL; 118678910aadSHong Zhang b->row = perm; 118778910aadSHong Zhang b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 118826fbe8dcSKarl Rupp 11899566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)perm)); 119026fbe8dcSKarl Rupp 119178910aadSHong Zhang b->icol = perm; 119226fbe8dcSKarl Rupp 11939566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)perm)); 11949566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(am + 1, &b->solve_work)); 119526fbe8dcSKarl Rupp 119678910aadSHong Zhang b->maxnz = b->nz = ui[am]; 119778910aadSHong Zhang 119878910aadSHong Zhang B->info.factor_mallocs = reallocs; 119978910aadSHong Zhang B->info.fill_ratio_given = fill; 120075567043SBarry Smith if (ai[am] != 0.) { 1201da81f932SPierre Jolivet /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */ 120295b5ac22SHong Zhang B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am); 120378910aadSHong Zhang } else { 120478910aadSHong Zhang B->info.fill_ratio_needed = 0.0; 120578910aadSHong Zhang } 12069263d837SHong Zhang #if defined(PETSC_USE_INFO) 12079263d837SHong Zhang if (ai[am] != 0) { 120895b5ac22SHong Zhang PetscReal af = B->info.fill_ratio_needed; 12099566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 12109566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 12119566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 12129263d837SHong Zhang } else { 12139566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Empty matrix\n")); 12149263d837SHong Zhang } 12159263d837SHong Zhang #endif 121678910aadSHong Zhang if (perm_identity) { 12170a3c351aSHong Zhang B->ops->solve = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 12180a3c351aSHong Zhang B->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace; 121978910aadSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 122078910aadSHong Zhang } else { 1221719d5645SBarry Smith (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 122278910aadSHong Zhang } 12233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1224c05c3958SHong Zhang } 1225c05c3958SHong Zhang 1226d71ae5a4SJacob Faibussowitsch PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) 1227d71ae5a4SJacob Faibussowitsch { 122878910aadSHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 122978910aadSHong Zhang Mat_SeqSBAIJ *b; 123078910aadSHong Zhang Mat B; 12319186b105SHong Zhang PetscBool perm_identity, missing; 123278910aadSHong Zhang PetscReal fill = info->fill; 12335d0c19d7SBarry Smith const PetscInt *rip; 12345d0c19d7SBarry Smith PetscInt i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow; 123578910aadSHong Zhang PetscInt *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow; 123678910aadSHong Zhang PetscInt nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr; 12370298fd71SBarry Smith PetscFreeSpaceList free_space = NULL, current_space = NULL; 123878910aadSHong Zhang PetscBT lnkbt; 123978910aadSHong Zhang 1240c05c3958SHong Zhang PetscFunctionBegin; 12416ad2eaddSHong Zhang if (bs > 1) { /* convert to seqsbaij */ 124248a46eb9SPierre Jolivet if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat)); 1243719d5645SBarry Smith (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */ 124426fbe8dcSKarl Rupp 12459566063dSJacob Faibussowitsch PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info)); 12463ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 12476ad2eaddSHong Zhang } 12486ad2eaddSHong Zhang 12499566063dSJacob Faibussowitsch PetscCall(MatMissingDiagonal(A, &missing, &i)); 125028b400f6SJacob Faibussowitsch PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 12519186b105SHong Zhang 125278910aadSHong Zhang /* check whether perm is the identity mapping */ 12539566063dSJacob Faibussowitsch PetscCall(ISIdentity(perm, &perm_identity)); 125428b400f6SJacob Faibussowitsch PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported"); 12559566063dSJacob Faibussowitsch PetscCall(ISGetIndices(perm, &rip)); 125678910aadSHong Zhang 125778910aadSHong Zhang /* initialization */ 12589566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &ui)); 1259e60cf9a0SBarry Smith ui[0] = 0; 126078910aadSHong Zhang 126178910aadSHong Zhang /* jl: linked list for storing indices of the pivot rows 126278910aadSHong Zhang il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */ 12639566063dSJacob Faibussowitsch PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols)); 126478910aadSHong Zhang for (i = 0; i < mbs; i++) { 12659371c9d4SSatish Balay jl[i] = mbs; 12669371c9d4SSatish Balay il[i] = 0; 126778910aadSHong Zhang } 126878910aadSHong Zhang 126978910aadSHong Zhang /* create and initialize a linked list for storing column indices of the active row k */ 127078910aadSHong Zhang nlnk = mbs + 1; 12719566063dSJacob Faibussowitsch PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 127278910aadSHong Zhang 12736a69fef8SHong Zhang /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */ 12749566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space)); 127526fbe8dcSKarl Rupp 127678910aadSHong Zhang current_space = free_space; 127778910aadSHong Zhang 127878910aadSHong Zhang for (k = 0; k < mbs; k++) { /* for each active row k */ 127978910aadSHong Zhang /* initialize lnk by the column indices of row rip[k] of A */ 128078910aadSHong Zhang nzk = 0; 128178910aadSHong Zhang ncols = ai[rip[k] + 1] - ai[rip[k]]; 1282e60cf9a0SBarry Smith ncols_upper = 0; 128378910aadSHong Zhang for (j = 0; j < ncols; j++) { 128478910aadSHong Zhang i = rip[*(aj + ai[rip[k]] + j)]; 128578910aadSHong Zhang if (i >= k) { /* only take upper triangular entry */ 128678910aadSHong Zhang cols[ncols_upper] = i; 128778910aadSHong Zhang ncols_upper++; 128878910aadSHong Zhang } 128978910aadSHong Zhang } 12909566063dSJacob Faibussowitsch PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt)); 129178910aadSHong Zhang nzk += nlnk; 129278910aadSHong Zhang 129378910aadSHong Zhang /* update lnk by computing fill-in for each pivot row to be merged in */ 129478910aadSHong Zhang prow = jl[k]; /* 1st pivot row */ 129578910aadSHong Zhang 129678910aadSHong Zhang while (prow < k) { 129778910aadSHong Zhang nextprow = jl[prow]; 129878910aadSHong Zhang /* merge prow into k-th row */ 129978910aadSHong Zhang jmin = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */ 130078910aadSHong Zhang jmax = ui[prow + 1]; 130178910aadSHong Zhang ncols = jmax - jmin; 130278910aadSHong Zhang uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */ 13039566063dSJacob Faibussowitsch PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt)); 130478910aadSHong Zhang nzk += nlnk; 130578910aadSHong Zhang 130678910aadSHong Zhang /* update il and jl for prow */ 130778910aadSHong Zhang if (jmin < jmax) { 130878910aadSHong Zhang il[prow] = jmin; 130926fbe8dcSKarl Rupp j = *uj_ptr; 131026fbe8dcSKarl Rupp jl[prow] = jl[j]; 131126fbe8dcSKarl Rupp jl[j] = prow; 131278910aadSHong Zhang } 131378910aadSHong Zhang prow = nextprow; 131478910aadSHong Zhang } 131578910aadSHong Zhang 131678910aadSHong Zhang /* if free space is not available, make more free space */ 131778910aadSHong Zhang if (current_space->local_remaining < nzk) { 131878910aadSHong Zhang i = mbs - k + 1; /* num of unfactored rows */ 1319f91af8c7SBarry Smith i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */ 13209566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceGet(i, ¤t_space)); 132178910aadSHong Zhang reallocs++; 132278910aadSHong Zhang } 132378910aadSHong Zhang 132478910aadSHong Zhang /* copy data into free space, then initialize lnk */ 13259566063dSJacob Faibussowitsch PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt)); 132678910aadSHong Zhang 132778910aadSHong Zhang /* add the k-th row into il and jl */ 132878910aadSHong Zhang if (nzk - 1 > 0) { 132978910aadSHong Zhang i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */ 13309371c9d4SSatish Balay jl[k] = jl[i]; 13319371c9d4SSatish Balay jl[i] = k; 133278910aadSHong Zhang il[k] = ui[k] + 1; 133378910aadSHong Zhang } 133478910aadSHong Zhang ui_ptr[k] = current_space->array; 133578910aadSHong Zhang current_space->array += nzk; 133678910aadSHong Zhang current_space->local_used += nzk; 133778910aadSHong Zhang current_space->local_remaining -= nzk; 133878910aadSHong Zhang 133978910aadSHong Zhang ui[k + 1] = ui[k] + nzk; 134078910aadSHong Zhang } 134178910aadSHong Zhang 13429566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(perm, &rip)); 13439566063dSJacob Faibussowitsch PetscCall(PetscFree4(ui_ptr, il, jl, cols)); 134478910aadSHong Zhang 13459263d837SHong Zhang /* copy free_space into uj and free free_space; set uj in new datastructure; */ 13469566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ui[mbs] + 1, &uj)); 13479566063dSJacob Faibussowitsch PetscCall(PetscFreeSpaceContiguous(&free_space, uj)); 13489566063dSJacob Faibussowitsch PetscCall(PetscLLDestroy(lnk, lnkbt)); 134978910aadSHong Zhang 135078910aadSHong Zhang /* put together the new matrix in MATSEQSBAIJ format */ 1351719d5645SBarry Smith B = fact; 13529566063dSJacob Faibussowitsch PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 135378910aadSHong Zhang 135478910aadSHong Zhang b = (Mat_SeqSBAIJ *)B->data; 135578910aadSHong Zhang b->singlemalloc = PETSC_FALSE; 1356e6b907acSBarry Smith b->free_a = PETSC_TRUE; 1357e6b907acSBarry Smith b->free_ij = PETSC_TRUE; 135826fbe8dcSKarl Rupp 13599566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a)); 136026fbe8dcSKarl Rupp 136178910aadSHong Zhang b->j = uj; 136278910aadSHong Zhang b->i = ui; 1363f4259b30SLisandro Dalcin b->diag = NULL; 1364f4259b30SLisandro Dalcin b->ilen = NULL; 1365f4259b30SLisandro Dalcin b->imax = NULL; 136678910aadSHong Zhang b->row = perm; 136778910aadSHong Zhang b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */ 136826fbe8dcSKarl Rupp 13699566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)perm)); 137078910aadSHong Zhang b->icol = perm; 13719566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)perm)); 13729566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &b->solve_work)); 137378910aadSHong Zhang b->maxnz = b->nz = ui[mbs]; 137478910aadSHong Zhang 137578910aadSHong Zhang B->info.factor_mallocs = reallocs; 137678910aadSHong Zhang B->info.fill_ratio_given = fill; 137775567043SBarry Smith if (ai[mbs] != 0.) { 137895b5ac22SHong Zhang /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */ 137995b5ac22SHong Zhang B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs); 138078910aadSHong Zhang } else { 138178910aadSHong Zhang B->info.fill_ratio_needed = 0.0; 138278910aadSHong Zhang } 13839263d837SHong Zhang #if defined(PETSC_USE_INFO) 13849263d837SHong Zhang if (ai[mbs] != 0.) { 13859263d837SHong Zhang PetscReal af = B->info.fill_ratio_needed; 13869566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af)); 13879566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af)); 13889566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af)); 13899263d837SHong Zhang } else { 13909566063dSJacob Faibussowitsch PetscCall(PetscInfo(A, "Empty matrix\n")); 13919263d837SHong Zhang } 13929263d837SHong Zhang #endif 139378910aadSHong Zhang if (perm_identity) { 13946ad2eaddSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering; 139578910aadSHong Zhang } else { 13966ad2eaddSHong Zhang B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N; 139778910aadSHong Zhang } 13983ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1399c05c3958SHong Zhang } 1400c8342467SHong Zhang 1401d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) 1402d71ae5a4SJacob Faibussowitsch { 14031a83e813SShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 14041a83e813SShri Abhyankar const PetscInt *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 14051a83e813SShri Abhyankar PetscInt i, k, n = a->mbs; 14061a83e813SShri Abhyankar PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1407d9ca1df4SBarry Smith const MatScalar *aa = a->a, *v; 1408d9ca1df4SBarry Smith PetscScalar *x, *s, *t, *ls; 1409d9ca1df4SBarry Smith const PetscScalar *b; 14101a83e813SShri Abhyankar 14111a83e813SShri Abhyankar PetscFunctionBegin; 14129566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14139566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 14141a83e813SShri Abhyankar t = a->solve_work; 14151a83e813SShri Abhyankar 14161a83e813SShri Abhyankar /* forward solve the lower triangular */ 14179566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */ 14181a83e813SShri Abhyankar 14191a83e813SShri Abhyankar for (i = 1; i < n; i++) { 14201a83e813SShri Abhyankar v = aa + bs2 * ai[i]; 14211a83e813SShri Abhyankar vi = aj + ai[i]; 14221a83e813SShri Abhyankar nz = ai[i + 1] - ai[i]; 14231a83e813SShri Abhyankar s = t + bs * i; 14249566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */ 14251a83e813SShri Abhyankar for (k = 0; k < nz; k++) { 142696b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]); 14271a83e813SShri Abhyankar v += bs2; 14281a83e813SShri Abhyankar } 14291a83e813SShri Abhyankar } 14301a83e813SShri Abhyankar 14311a83e813SShri Abhyankar /* backward solve the upper triangular */ 14321a83e813SShri Abhyankar ls = a->solve_work + A->cmap->n; 14331a83e813SShri Abhyankar for (i = n - 1; i >= 0; i--) { 14341a83e813SShri Abhyankar v = aa + bs2 * (adiag[i + 1] + 1); 14351a83e813SShri Abhyankar vi = aj + adiag[i + 1] + 1; 14361a83e813SShri Abhyankar nz = adiag[i] - adiag[i + 1] - 1; 14379566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 14381a83e813SShri Abhyankar for (k = 0; k < nz; k++) { 143996b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]); 14401a83e813SShri Abhyankar v += bs2; 14411a83e813SShri Abhyankar } 144296b95a6bSBarry Smith PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */ 14439566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs)); 14441a83e813SShri Abhyankar } 14451a83e813SShri Abhyankar 14469566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 14479566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 14489566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 14493ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 14501a83e813SShri Abhyankar } 14511a83e813SShri Abhyankar 1452d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) 1453d71ae5a4SJacob Faibussowitsch { 145435aa4fcfSShri Abhyankar Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 145535aa4fcfSShri Abhyankar IS iscol = a->col, isrow = a->row; 145635aa4fcfSShri Abhyankar const PetscInt *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi; 145735aa4fcfSShri Abhyankar PetscInt i, m, n = a->mbs; 145835aa4fcfSShri Abhyankar PetscInt nz, bs = A->rmap->bs, bs2 = a->bs2; 1459d9ca1df4SBarry Smith const MatScalar *aa = a->a, *v; 1460d9ca1df4SBarry Smith PetscScalar *x, *s, *t, *ls; 1461d9ca1df4SBarry Smith const PetscScalar *b; 146235aa4fcfSShri Abhyankar 146335aa4fcfSShri Abhyankar PetscFunctionBegin; 14649566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14659566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 146635aa4fcfSShri Abhyankar t = a->solve_work; 146735aa4fcfSShri Abhyankar 14689371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14699371c9d4SSatish Balay r = rout; 14709371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14719371c9d4SSatish Balay c = cout; 147235aa4fcfSShri Abhyankar 147335aa4fcfSShri Abhyankar /* forward solve the lower triangular */ 14749566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b + bs * r[0], bs)); 147535aa4fcfSShri Abhyankar for (i = 1; i < n; i++) { 147635aa4fcfSShri Abhyankar v = aa + bs2 * ai[i]; 147735aa4fcfSShri Abhyankar vi = aj + ai[i]; 147835aa4fcfSShri Abhyankar nz = ai[i + 1] - ai[i]; 147935aa4fcfSShri Abhyankar s = t + bs * i; 14809566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * r[i], bs)); 148135aa4fcfSShri Abhyankar for (m = 0; m < nz; m++) { 148296b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]); 148335aa4fcfSShri Abhyankar v += bs2; 148435aa4fcfSShri Abhyankar } 148535aa4fcfSShri Abhyankar } 148635aa4fcfSShri Abhyankar 148735aa4fcfSShri Abhyankar /* backward solve the upper triangular */ 148835aa4fcfSShri Abhyankar ls = a->solve_work + A->cmap->n; 148935aa4fcfSShri Abhyankar for (i = n - 1; i >= 0; i--) { 149035aa4fcfSShri Abhyankar v = aa + bs2 * (adiag[i + 1] + 1); 149135aa4fcfSShri Abhyankar vi = aj + adiag[i + 1] + 1; 149235aa4fcfSShri Abhyankar nz = adiag[i] - adiag[i + 1] - 1; 14939566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 149435aa4fcfSShri Abhyankar for (m = 0; m < nz; m++) { 149596b95a6bSBarry Smith PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]); 149635aa4fcfSShri Abhyankar v += bs2; 149735aa4fcfSShri Abhyankar } 149896b95a6bSBarry Smith PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */ 14999566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs)); 150035aa4fcfSShri Abhyankar } 15019566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15029566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15039566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15049566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15059566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 15063ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 150735aa4fcfSShri Abhyankar } 150835aa4fcfSShri Abhyankar 1509a25532f0SBarry Smith /* 1510a25532f0SBarry Smith For each block in an block array saves the largest absolute value in the block into another array 1511a25532f0SBarry Smith */ 1512d71ae5a4SJacob Faibussowitsch static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) 1513d71ae5a4SJacob Faibussowitsch { 15142efa7f71SHong Zhang PetscInt i, j; 15155fd66863SKarl Rupp 15162efa7f71SHong Zhang PetscFunctionBegin; 15179566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(absarray, nbs + 1)); 15182efa7f71SHong Zhang for (i = 0; i < nbs; i++) { 15192efa7f71SHong Zhang for (j = 0; j < bs2; j++) { 15202efa7f71SHong Zhang if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]); 15212efa7f71SHong Zhang } 15222efa7f71SHong Zhang } 15233ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 15242efa7f71SHong Zhang } 15252efa7f71SHong Zhang 1526fe97e370SBarry Smith /* 1527fe97e370SBarry Smith This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used 1528fe97e370SBarry Smith */ 1529d71ae5a4SJacob Faibussowitsch PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) 1530d71ae5a4SJacob Faibussowitsch { 15312efa7f71SHong Zhang Mat B = *fact; 15322efa7f71SHong Zhang Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b; 15332efa7f71SHong Zhang IS isicol; 15342efa7f71SHong Zhang const PetscInt *r, *ic; 15352efa7f71SHong Zhang PetscInt i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag; 15362efa7f71SHong Zhang PetscInt *bi, *bj, *bdiag; 15372efa7f71SHong Zhang 15382efa7f71SHong Zhang PetscInt row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au; 15392efa7f71SHong Zhang PetscInt nlnk, *lnk; 15402efa7f71SHong Zhang PetscBT lnkbt; 1541ace3abfcSBarry Smith PetscBool row_identity, icol_identity; 15422efa7f71SHong Zhang MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp; 15432efa7f71SHong Zhang PetscInt j, nz, *pj, *bjtmp, k, ncut, *jtmp; 15442efa7f71SHong Zhang 1545182b8fbaSHong Zhang PetscReal dt = info->dt; /* shift=info->shiftamount; */ 15462efa7f71SHong Zhang PetscInt nnz_max; 1547ace3abfcSBarry Smith PetscBool missing; 1548021822bcSHong Zhang PetscReal *vtmp_abs; 154997ef94ebSSatish Balay MatScalar *v_work; 155097ef94ebSSatish Balay PetscInt *v_pivots; 15515f8bbccaSHong Zhang PetscBool allowzeropivot, zeropivotdetected = PETSC_FALSE; 15522efa7f71SHong Zhang 1553c8342467SHong Zhang PetscFunctionBegin; 15542efa7f71SHong Zhang /* ------- symbolic factorization, can be reused ---------*/ 15559566063dSJacob Faibussowitsch PetscCall(MatMissingDiagonal(A, &missing, &i)); 155628b400f6SJacob Faibussowitsch PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i); 15572efa7f71SHong Zhang adiag = a->diag; 15582efa7f71SHong Zhang 15599566063dSJacob Faibussowitsch PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol)); 15602efa7f71SHong Zhang 15612efa7f71SHong Zhang /* bdiag is location of diagonal in factor */ 15629566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &bdiag)); 15632efa7f71SHong Zhang 15642efa7f71SHong Zhang /* allocate row pointers bi */ 15659566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(2 * mbs + 2, &bi)); 15662efa7f71SHong Zhang 15672efa7f71SHong Zhang /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */ 15682efa7f71SHong Zhang dtcount = (PetscInt)info->dtcount; 15696bce7ff8SHong Zhang if (dtcount > mbs - 1) dtcount = mbs - 1; 15706bce7ff8SHong Zhang nnz_max = ai[mbs] + 2 * mbs * dtcount + 2; 15716da515a1SHong Zhang /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */ 15729566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nnz_max, &bj)); 15736bce7ff8SHong Zhang nnz_max = nnz_max * bs2; 15749566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(nnz_max, &ba)); 15752efa7f71SHong Zhang 15762efa7f71SHong Zhang /* put together the new matrix */ 15779566063dSJacob Faibussowitsch PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL)); 157826fbe8dcSKarl Rupp 15792efa7f71SHong Zhang b = (Mat_SeqBAIJ *)(B)->data; 15802efa7f71SHong Zhang b->free_a = PETSC_TRUE; 15812efa7f71SHong Zhang b->free_ij = PETSC_TRUE; 15822efa7f71SHong Zhang b->singlemalloc = PETSC_FALSE; 158326fbe8dcSKarl Rupp 15842efa7f71SHong Zhang b->a = ba; 15852efa7f71SHong Zhang b->j = bj; 15862efa7f71SHong Zhang b->i = bi; 15872efa7f71SHong Zhang b->diag = bdiag; 1588f4259b30SLisandro Dalcin b->ilen = NULL; 1589f4259b30SLisandro Dalcin b->imax = NULL; 15902efa7f71SHong Zhang b->row = isrow; 15912efa7f71SHong Zhang b->col = iscol; 159226fbe8dcSKarl Rupp 15939566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)isrow)); 15949566063dSJacob Faibussowitsch PetscCall(PetscObjectReference((PetscObject)iscol)); 159526fbe8dcSKarl Rupp 15962efa7f71SHong Zhang b->icol = isicol; 15979566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work)); 15982efa7f71SHong Zhang b->maxnz = nnz_max / bs2; 15992efa7f71SHong Zhang 1600d5f3da31SBarry Smith (B)->factortype = MAT_FACTOR_ILUDT; 16012efa7f71SHong Zhang (B)->info.factor_mallocs = 0; 16022efa7f71SHong Zhang (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2)); 16032efa7f71SHong Zhang /* ------- end of symbolic factorization ---------*/ 16049566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isrow, &r)); 16059566063dSJacob Faibussowitsch PetscCall(ISGetIndices(isicol, &ic)); 16062efa7f71SHong Zhang 16072efa7f71SHong Zhang /* linked list for storing column indices of the active row */ 16082efa7f71SHong Zhang nlnk = mbs + 1; 16099566063dSJacob Faibussowitsch PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt)); 16102efa7f71SHong Zhang 16112efa7f71SHong Zhang /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */ 16129566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp)); 16132efa7f71SHong Zhang /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */ 16149566063dSJacob Faibussowitsch PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp)); 16159566063dSJacob Faibussowitsch PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs)); 16169566063dSJacob Faibussowitsch PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots)); 16172efa7f71SHong Zhang 16185f8bbccaSHong Zhang allowzeropivot = PetscNot(A->erroriffailure); 1619e60cf9a0SBarry Smith bi[0] = 0; 16202efa7f71SHong Zhang bdiag[0] = (nnz_max / bs2) - 1; /* location of diagonal in factor B */ 16216bce7ff8SHong Zhang bi[2 * mbs + 1] = bdiag[0] + 1; /* endof bj and ba array */ 16222efa7f71SHong Zhang for (i = 0; i < mbs; i++) { 16232efa7f71SHong Zhang /* copy initial fill into linked list */ 16242efa7f71SHong Zhang nzi = ai[r[i] + 1] - ai[r[i]]; 162528b400f6SJacob Faibussowitsch PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i); 16262efa7f71SHong Zhang nzi_al = adiag[r[i]] - ai[r[i]]; 16272efa7f71SHong Zhang nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1; 16282efa7f71SHong Zhang 16292efa7f71SHong Zhang /* load in initial unfactored row */ 16302efa7f71SHong Zhang ajtmp = aj + ai[r[i]]; 16319566063dSJacob Faibussowitsch PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt)); 16329566063dSJacob Faibussowitsch PetscCall(PetscArrayzero(rtmp, mbs * bs2)); 16332efa7f71SHong Zhang aatmp = a->a + bs2 * ai[r[i]]; 16349566063dSJacob Faibussowitsch for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2)); 16352efa7f71SHong Zhang 16362efa7f71SHong Zhang /* add pivot rows into linked list */ 16372efa7f71SHong Zhang row = lnk[mbs]; 16382efa7f71SHong Zhang while (row < i) { 16392efa7f71SHong Zhang nzi_bl = bi[row + 1] - bi[row] + 1; 16402efa7f71SHong Zhang bjtmp = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */ 16419566063dSJacob Faibussowitsch PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im)); 16422efa7f71SHong Zhang nzi += nlnk; 16432efa7f71SHong Zhang row = lnk[row]; 16442efa7f71SHong Zhang } 16452efa7f71SHong Zhang 16462efa7f71SHong Zhang /* copy data from lnk into jtmp, then initialize lnk */ 16479566063dSJacob Faibussowitsch PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt)); 16482efa7f71SHong Zhang 16492efa7f71SHong Zhang /* numerical factorization */ 16502efa7f71SHong Zhang bjtmp = jtmp; 16512efa7f71SHong Zhang row = *bjtmp++; /* 1st pivot row */ 16522efa7f71SHong Zhang 16532efa7f71SHong Zhang while (row < i) { 16542efa7f71SHong Zhang pc = rtmp + bs2 * row; 16552efa7f71SHong Zhang pv = ba + bs2 * bdiag[row]; /* inv(diag) of the pivot row */ 165696b95a6bSBarry Smith PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */ 16579566063dSJacob Faibussowitsch PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs)); 16582efa7f71SHong Zhang if (vtmp_abs[0] > dt) { /* apply tolerance dropping rule */ 16592efa7f71SHong Zhang pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */ 16602efa7f71SHong Zhang pv = ba + bs2 * (bdiag[row + 1] + 1); 16612efa7f71SHong Zhang nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */ 1662ad540459SPierre Jolivet for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j); 16639566063dSJacob Faibussowitsch /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */ 16642efa7f71SHong Zhang } 16652efa7f71SHong Zhang row = *bjtmp++; 16662efa7f71SHong Zhang } 16672efa7f71SHong Zhang 16682efa7f71SHong Zhang /* copy sparse rtmp into contiguous vtmp; separate L and U part */ 16699371c9d4SSatish Balay nzi_bl = 0; 16709371c9d4SSatish Balay j = 0; 16712efa7f71SHong Zhang while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */ 16729566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 16739371c9d4SSatish Balay nzi_bl++; 16749371c9d4SSatish Balay j++; 16752efa7f71SHong Zhang } 16762efa7f71SHong Zhang nzi_bu = nzi - nzi_bl - 1; 16772efa7f71SHong Zhang 16782efa7f71SHong Zhang while (j < nzi) { /* U-part */ 16799566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2)); 16802efa7f71SHong Zhang j++; 16812efa7f71SHong Zhang } 16822efa7f71SHong Zhang 16839566063dSJacob Faibussowitsch PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs)); 16845f8bbccaSHong Zhang 16852efa7f71SHong Zhang bjtmp = bj + bi[i]; 16862efa7f71SHong Zhang batmp = ba + bs2 * bi[i]; 16872efa7f71SHong Zhang /* apply level dropping rule to L part */ 16882efa7f71SHong Zhang ncut = nzi_al + dtcount; 16892efa7f71SHong Zhang if (ncut < nzi_bl) { 16909566063dSJacob Faibussowitsch PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp)); 16919566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp)); 16922efa7f71SHong Zhang } else { 16932efa7f71SHong Zhang ncut = nzi_bl; 16942efa7f71SHong Zhang } 16952efa7f71SHong Zhang for (j = 0; j < ncut; j++) { 16962efa7f71SHong Zhang bjtmp[j] = jtmp[j]; 16979566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2)); 16982efa7f71SHong Zhang } 16992efa7f71SHong Zhang bi[i + 1] = bi[i] + ncut; 17002efa7f71SHong Zhang nzi = ncut + 1; 17012efa7f71SHong Zhang 17022efa7f71SHong Zhang /* apply level dropping rule to U part */ 17032efa7f71SHong Zhang ncut = nzi_au + dtcount; 17042efa7f71SHong Zhang if (ncut < nzi_bu) { 17059566063dSJacob Faibussowitsch PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1)); 17069566063dSJacob Faibussowitsch PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1)); 17072efa7f71SHong Zhang } else { 17082efa7f71SHong Zhang ncut = nzi_bu; 17092efa7f71SHong Zhang } 17102efa7f71SHong Zhang nzi += ncut; 17112efa7f71SHong Zhang 17122efa7f71SHong Zhang /* mark bdiagonal */ 17132efa7f71SHong Zhang bdiag[i + 1] = bdiag[i] - (ncut + 1); 17146bce7ff8SHong Zhang bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1); 17156bce7ff8SHong Zhang 17162efa7f71SHong Zhang bjtmp = bj + bdiag[i]; 17172efa7f71SHong Zhang batmp = ba + bs2 * bdiag[i]; 17189566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2)); 17192efa7f71SHong Zhang *bjtmp = i; 17205f8bbccaSHong Zhang 17212efa7f71SHong Zhang bjtmp = bj + bdiag[i + 1] + 1; 17222efa7f71SHong Zhang batmp = ba + (bdiag[i + 1] + 1) * bs2; 17232efa7f71SHong Zhang 17242efa7f71SHong Zhang for (k = 0; k < ncut; k++) { 17252efa7f71SHong Zhang bjtmp[k] = jtmp[nzi_bl + 1 + k]; 17269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2)); 17272efa7f71SHong Zhang } 17282efa7f71SHong Zhang 17292efa7f71SHong Zhang im[i] = nzi; /* used by PetscLLAddSortedLU() */ 17302efa7f71SHong Zhang 1731a5b23f4aSJose E. Roman /* invert diagonal block for simpler triangular solves - add shift??? */ 17322efa7f71SHong Zhang batmp = ba + bs2 * bdiag[i]; 17335f8bbccaSHong Zhang 17349566063dSJacob Faibussowitsch PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected)); 17357b6c816cSBarry Smith if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT; 17362efa7f71SHong Zhang } /* for (i=0; i<mbs; i++) */ 17379566063dSJacob Faibussowitsch PetscCall(PetscFree3(v_work, multiplier, v_pivots)); 17382efa7f71SHong Zhang 17396da515a1SHong Zhang /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */ 174094bad497SJacob Faibussowitsch PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]); 17412efa7f71SHong Zhang 17429566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &r)); 17439566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isicol, &ic)); 17442efa7f71SHong Zhang 17459566063dSJacob Faibussowitsch PetscCall(PetscLLDestroy(lnk, lnkbt)); 17462efa7f71SHong Zhang 17479566063dSJacob Faibussowitsch PetscCall(PetscFree2(im, jtmp)); 17489566063dSJacob Faibussowitsch PetscCall(PetscFree2(rtmp, vtmp)); 17492efa7f71SHong Zhang 17509566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(bs2 * B->cmap->n)); 17512efa7f71SHong Zhang b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs]; 17522efa7f71SHong Zhang 17539566063dSJacob Faibussowitsch PetscCall(ISIdentity(isrow, &row_identity)); 17549566063dSJacob Faibussowitsch PetscCall(ISIdentity(isicol, &icol_identity)); 17552efa7f71SHong Zhang if (row_identity && icol_identity) { 17564dd39f65SShri Abhyankar B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 17572efa7f71SHong Zhang } else { 17584dd39f65SShri Abhyankar B->ops->solve = MatSolve_SeqBAIJ_N; 17592efa7f71SHong Zhang } 17602efa7f71SHong Zhang 1761f4259b30SLisandro Dalcin B->ops->solveadd = NULL; 1762f4259b30SLisandro Dalcin B->ops->solvetranspose = NULL; 1763f4259b30SLisandro Dalcin B->ops->solvetransposeadd = NULL; 1764f4259b30SLisandro Dalcin B->ops->matsolve = NULL; 17652efa7f71SHong Zhang B->assembled = PETSC_TRUE; 17662efa7f71SHong Zhang B->preallocated = PETSC_TRUE; 17673ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 1768c8342467SHong Zhang } 1769