xref: /petsc/src/mat/impls/baij/seq/baijfact13.c (revision 48a46eb9bd028bec07ec0f396b1a3abb43f14558)
1be1d678aSKris Buschelman 
283287d42SBarry Smith /*
383287d42SBarry Smith     Factorization code for BAIJ format.
483287d42SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
783287d42SBarry Smith 
883287d42SBarry Smith /*
983287d42SBarry Smith       Version for when blocks are 3 by 3
1083287d42SBarry Smith */
119371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info) {
1283287d42SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1383287d42SBarry Smith   IS              isrow = b->row, isicol = b->icol;
145d0c19d7SBarry Smith   const PetscInt *r, *ic;
155d0c19d7SBarry Smith   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
16690b6cddSBarry Smith   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
17690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag, idx, *pj;
1883287d42SBarry Smith   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
1983287d42SBarry Smith   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
2083287d42SBarry Smith   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
2183287d42SBarry Smith   MatScalar      *ba = b->a, *aa = a->a;
22182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
23a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
2483287d42SBarry Smith 
2583287d42SBarry Smith   PetscFunctionBegin;
269566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
295f8bbccaSHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
3083287d42SBarry Smith 
3183287d42SBarry Smith   for (i = 0; i < n; i++) {
3283287d42SBarry Smith     nz    = bi[i + 1] - bi[i];
3383287d42SBarry Smith     ajtmp = bj + bi[i];
3483287d42SBarry Smith     for (j = 0; j < nz; j++) {
3583287d42SBarry Smith       x    = rtmp + 9 * ajtmp[j];
3683287d42SBarry Smith       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
3783287d42SBarry Smith     }
3883287d42SBarry Smith     /* load in initial (unfactored row) */
3983287d42SBarry Smith     idx      = r[i];
4083287d42SBarry Smith     nz       = ai[idx + 1] - ai[idx];
4183287d42SBarry Smith     ajtmpold = aj + ai[idx];
4283287d42SBarry Smith     v        = aa + 9 * ai[idx];
4383287d42SBarry Smith     for (j = 0; j < nz; j++) {
4483287d42SBarry Smith       x    = rtmp + 9 * ic[ajtmpold[j]];
459371c9d4SSatish Balay       x[0] = v[0];
469371c9d4SSatish Balay       x[1] = v[1];
479371c9d4SSatish Balay       x[2] = v[2];
489371c9d4SSatish Balay       x[3] = v[3];
499371c9d4SSatish Balay       x[4] = v[4];
509371c9d4SSatish Balay       x[5] = v[5];
519371c9d4SSatish Balay       x[6] = v[6];
529371c9d4SSatish Balay       x[7] = v[7];
539371c9d4SSatish Balay       x[8] = v[8];
5483287d42SBarry Smith       v += 9;
5583287d42SBarry Smith     }
5683287d42SBarry Smith     row = *ajtmp++;
5783287d42SBarry Smith     while (row < i) {
5883287d42SBarry Smith       pc = rtmp + 9 * row;
599371c9d4SSatish Balay       p1 = pc[0];
609371c9d4SSatish Balay       p2 = pc[1];
619371c9d4SSatish Balay       p3 = pc[2];
629371c9d4SSatish Balay       p4 = pc[3];
639371c9d4SSatish Balay       p5 = pc[4];
649371c9d4SSatish Balay       p6 = pc[5];
659371c9d4SSatish Balay       p7 = pc[6];
669371c9d4SSatish Balay       p8 = pc[7];
679371c9d4SSatish Balay       p9 = pc[8];
689371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
6983287d42SBarry Smith         pv    = ba + 9 * diag_offset[row];
7083287d42SBarry Smith         pj    = bj + diag_offset[row] + 1;
719371c9d4SSatish Balay         x1    = pv[0];
729371c9d4SSatish Balay         x2    = pv[1];
739371c9d4SSatish Balay         x3    = pv[2];
749371c9d4SSatish Balay         x4    = pv[3];
759371c9d4SSatish Balay         x5    = pv[4];
769371c9d4SSatish Balay         x6    = pv[5];
779371c9d4SSatish Balay         x7    = pv[6];
789371c9d4SSatish Balay         x8    = pv[7];
799371c9d4SSatish Balay         x9    = pv[8];
8083287d42SBarry Smith         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
8183287d42SBarry Smith         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
8283287d42SBarry Smith         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
8383287d42SBarry Smith 
8483287d42SBarry Smith         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
8583287d42SBarry Smith         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
8683287d42SBarry Smith         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
8783287d42SBarry Smith 
8883287d42SBarry Smith         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
8983287d42SBarry Smith         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
9083287d42SBarry Smith         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
9183287d42SBarry Smith         nz         = bi[row + 1] - diag_offset[row] - 1;
9283287d42SBarry Smith         pv += 9;
9383287d42SBarry Smith         for (j = 0; j < nz; j++) {
949371c9d4SSatish Balay           x1 = pv[0];
959371c9d4SSatish Balay           x2 = pv[1];
969371c9d4SSatish Balay           x3 = pv[2];
979371c9d4SSatish Balay           x4 = pv[3];
989371c9d4SSatish Balay           x5 = pv[4];
999371c9d4SSatish Balay           x6 = pv[5];
1009371c9d4SSatish Balay           x7 = pv[6];
1019371c9d4SSatish Balay           x8 = pv[7];
1029371c9d4SSatish Balay           x9 = pv[8];
10383287d42SBarry Smith           x  = rtmp + 9 * pj[j];
10483287d42SBarry Smith           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
10583287d42SBarry Smith           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
10683287d42SBarry Smith           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
10783287d42SBarry Smith 
10883287d42SBarry Smith           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
10983287d42SBarry Smith           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
11083287d42SBarry Smith           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
11183287d42SBarry Smith 
11283287d42SBarry Smith           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
11383287d42SBarry Smith           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
11483287d42SBarry Smith           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
11583287d42SBarry Smith           pv += 9;
11683287d42SBarry Smith         }
1179566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
11883287d42SBarry Smith       }
11983287d42SBarry Smith       row = *ajtmp++;
12083287d42SBarry Smith     }
12183287d42SBarry Smith     /* finished row so stick it into b->a */
12283287d42SBarry Smith     pv = ba + 9 * bi[i];
12383287d42SBarry Smith     pj = bj + bi[i];
12483287d42SBarry Smith     nz = bi[i + 1] - bi[i];
12583287d42SBarry Smith     for (j = 0; j < nz; j++) {
12683287d42SBarry Smith       x     = rtmp + 9 * pj[j];
1279371c9d4SSatish Balay       pv[0] = x[0];
1289371c9d4SSatish Balay       pv[1] = x[1];
1299371c9d4SSatish Balay       pv[2] = x[2];
1309371c9d4SSatish Balay       pv[3] = x[3];
1319371c9d4SSatish Balay       pv[4] = x[4];
1329371c9d4SSatish Balay       pv[5] = x[5];
1339371c9d4SSatish Balay       pv[6] = x[6];
1349371c9d4SSatish Balay       pv[7] = x[7];
1359371c9d4SSatish Balay       pv[8] = x[8];
13683287d42SBarry Smith       pv += 9;
13783287d42SBarry Smith     }
13883287d42SBarry Smith     /* invert diagonal block */
13983287d42SBarry Smith     w = ba + 9 * diag_offset[i];
1409566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
1417b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
14283287d42SBarry Smith   }
14383287d42SBarry Smith 
1449566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
1459566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
1469566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
14726fbe8dcSKarl Rupp 
14806e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
14906e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
15083287d42SBarry Smith   C->assembled           = PETSC_TRUE;
15126fbe8dcSKarl Rupp 
1529566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
15383287d42SBarry Smith   PetscFunctionReturn(0);
15483287d42SBarry Smith }
15517542729SShri Abhyankar 
1564dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_3 -
1574dd39f65SShri Abhyankar      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
15896b95a6bSBarry Smith        PetscKernel_A_gets_A_times_B()
15996b95a6bSBarry Smith        PetscKernel_A_gets_A_minus_B_times_C()
16096b95a6bSBarry Smith        PetscKernel_A_gets_inverse_A()
16117542729SShri Abhyankar */
1629371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info) {
16317542729SShri Abhyankar   Mat             C = B;
16417542729SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
16517542729SShri Abhyankar   IS              isrow = b->row, isicol = b->icol;
1665a586d82SBarry Smith   const PetscInt *r, *ic;
167bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
168bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
169bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
17017542729SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
171bbd65245SShri Abhyankar   PetscInt        flg;
172182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
173a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
17417542729SShri Abhyankar 
17517542729SShri Abhyankar   PetscFunctionBegin;
1769566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
1779566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
1780164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
17917542729SShri Abhyankar 
18017542729SShri Abhyankar   /* generate work space needed by the factorization */
1819566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
1829566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
18317542729SShri Abhyankar 
18417542729SShri Abhyankar   for (i = 0; i < n; i++) {
18517542729SShri Abhyankar     /* zero rtmp */
18617542729SShri Abhyankar     /* L part */
18717542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
18817542729SShri Abhyankar     bjtmp = bj + bi[i];
189*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
19017542729SShri Abhyankar 
19117542729SShri Abhyankar     /* U part */
1920c4413a7SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
1930c4413a7SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
194*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
1950c4413a7SShri Abhyankar 
1960c4413a7SShri Abhyankar     /* load in initial (unfactored row) */
1970c4413a7SShri Abhyankar     nz    = ai[r[i] + 1] - ai[r[i]];
1980c4413a7SShri Abhyankar     ajtmp = aj + ai[r[i]];
1990c4413a7SShri Abhyankar     v     = aa + bs2 * ai[r[i]];
200*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
2010c4413a7SShri Abhyankar 
2020c4413a7SShri Abhyankar     /* elimination */
2030c4413a7SShri Abhyankar     bjtmp = bj + bi[i];
2040c4413a7SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
2050c4413a7SShri Abhyankar     for (k = 0; k < nzL; k++) {
2060c4413a7SShri Abhyankar       row = bjtmp[k];
2070c4413a7SShri Abhyankar       pc  = rtmp + bs2 * row;
208c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
209c35f09e5SBarry Smith         if (pc[j] != 0.0) {
210c35f09e5SBarry Smith           flg = 1;
211c35f09e5SBarry Smith           break;
212c35f09e5SBarry Smith         }
213c35f09e5SBarry Smith       }
2140c4413a7SShri Abhyankar       if (flg) {
2150c4413a7SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
21696b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
2179566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
2180c4413a7SShri Abhyankar 
2190c4413a7SShri Abhyankar         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
2200c4413a7SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
2210c4413a7SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
2220c4413a7SShri Abhyankar         for (j = 0; j < nz; j++) {
22396b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
2240c4413a7SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
2250c4413a7SShri Abhyankar           v = rtmp + bs2 * pj[j];
2269566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
2270c4413a7SShri Abhyankar           pv += bs2;
2280c4413a7SShri Abhyankar         }
2299566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
2300c4413a7SShri Abhyankar       }
2310c4413a7SShri Abhyankar     }
2320c4413a7SShri Abhyankar 
2330c4413a7SShri Abhyankar     /* finished row so stick it into b->a */
2340c4413a7SShri Abhyankar     /* L part */
2350c4413a7SShri Abhyankar     pv = b->a + bs2 * bi[i];
2360c4413a7SShri Abhyankar     pj = b->j + bi[i];
2370c4413a7SShri Abhyankar     nz = bi[i + 1] - bi[i];
238*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2390c4413a7SShri Abhyankar 
240a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
2410c4413a7SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
2420c4413a7SShri Abhyankar     pj = b->j + bdiag[i];
2439566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
2449566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
2457b6c816cSBarry Smith     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2460c4413a7SShri Abhyankar 
2470c4413a7SShri Abhyankar     /* U part */
2480c4413a7SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
2490c4413a7SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
2500c4413a7SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
251*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2520c4413a7SShri Abhyankar   }
2530c4413a7SShri Abhyankar 
2549566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
2559566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
2569566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
25726fbe8dcSKarl Rupp 
2584dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_3;
2594dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
2600c4413a7SShri Abhyankar   C->assembled           = PETSC_TRUE;
26126fbe8dcSKarl Rupp 
2629566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
2630c4413a7SShri Abhyankar   PetscFunctionReturn(0);
2640c4413a7SShri Abhyankar }
2650c4413a7SShri Abhyankar 
2669371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) {
26717542729SShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
26817542729SShri Abhyankar   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
26917542729SShri Abhyankar   PetscInt    *ajtmpold, *ajtmp, nz, row;
27017542729SShri Abhyankar   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
27117542729SShri Abhyankar   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
27217542729SShri Abhyankar   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
27317542729SShri Abhyankar   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
27417542729SShri Abhyankar   MatScalar   *ba = b->a, *aa = a->a;
275182b8fbaSHong Zhang   PetscReal    shift = info->shiftamount;
276a455e926SHong Zhang   PetscBool    allowzeropivot, zeropivotdetected;
27717542729SShri Abhyankar 
27817542729SShri Abhyankar   PetscFunctionBegin;
2799566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
2800164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
28117542729SShri Abhyankar 
28217542729SShri Abhyankar   for (i = 0; i < n; i++) {
28317542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
28417542729SShri Abhyankar     ajtmp = bj + bi[i];
28517542729SShri Abhyankar     for (j = 0; j < nz; j++) {
28617542729SShri Abhyankar       x    = rtmp + 9 * ajtmp[j];
28717542729SShri Abhyankar       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
28817542729SShri Abhyankar     }
28917542729SShri Abhyankar     /* load in initial (unfactored row) */
29017542729SShri Abhyankar     nz       = ai[i + 1] - ai[i];
29117542729SShri Abhyankar     ajtmpold = aj + ai[i];
29217542729SShri Abhyankar     v        = aa + 9 * ai[i];
29317542729SShri Abhyankar     for (j = 0; j < nz; j++) {
29417542729SShri Abhyankar       x    = rtmp + 9 * ajtmpold[j];
2959371c9d4SSatish Balay       x[0] = v[0];
2969371c9d4SSatish Balay       x[1] = v[1];
2979371c9d4SSatish Balay       x[2] = v[2];
2989371c9d4SSatish Balay       x[3] = v[3];
2999371c9d4SSatish Balay       x[4] = v[4];
3009371c9d4SSatish Balay       x[5] = v[5];
3019371c9d4SSatish Balay       x[6] = v[6];
3029371c9d4SSatish Balay       x[7] = v[7];
3039371c9d4SSatish Balay       x[8] = v[8];
30417542729SShri Abhyankar       v += 9;
30517542729SShri Abhyankar     }
30617542729SShri Abhyankar     row = *ajtmp++;
30717542729SShri Abhyankar     while (row < i) {
30817542729SShri Abhyankar       pc = rtmp + 9 * row;
3099371c9d4SSatish Balay       p1 = pc[0];
3109371c9d4SSatish Balay       p2 = pc[1];
3119371c9d4SSatish Balay       p3 = pc[2];
3129371c9d4SSatish Balay       p4 = pc[3];
3139371c9d4SSatish Balay       p5 = pc[4];
3149371c9d4SSatish Balay       p6 = pc[5];
3159371c9d4SSatish Balay       p7 = pc[6];
3169371c9d4SSatish Balay       p8 = pc[7];
3179371c9d4SSatish Balay       p9 = pc[8];
3189371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
31917542729SShri Abhyankar         pv    = ba + 9 * diag_offset[row];
32017542729SShri Abhyankar         pj    = bj + diag_offset[row] + 1;
3219371c9d4SSatish Balay         x1    = pv[0];
3229371c9d4SSatish Balay         x2    = pv[1];
3239371c9d4SSatish Balay         x3    = pv[2];
3249371c9d4SSatish Balay         x4    = pv[3];
3259371c9d4SSatish Balay         x5    = pv[4];
3269371c9d4SSatish Balay         x6    = pv[5];
3279371c9d4SSatish Balay         x7    = pv[6];
3289371c9d4SSatish Balay         x8    = pv[7];
3299371c9d4SSatish Balay         x9    = pv[8];
33017542729SShri Abhyankar         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
33117542729SShri Abhyankar         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
33217542729SShri Abhyankar         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
33317542729SShri Abhyankar 
33417542729SShri Abhyankar         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
33517542729SShri Abhyankar         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
33617542729SShri Abhyankar         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
33717542729SShri Abhyankar 
33817542729SShri Abhyankar         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
33917542729SShri Abhyankar         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
34017542729SShri Abhyankar         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
34117542729SShri Abhyankar 
34217542729SShri Abhyankar         nz = bi[row + 1] - diag_offset[row] - 1;
34317542729SShri Abhyankar         pv += 9;
34417542729SShri Abhyankar         for (j = 0; j < nz; j++) {
3459371c9d4SSatish Balay           x1 = pv[0];
3469371c9d4SSatish Balay           x2 = pv[1];
3479371c9d4SSatish Balay           x3 = pv[2];
3489371c9d4SSatish Balay           x4 = pv[3];
3499371c9d4SSatish Balay           x5 = pv[4];
3509371c9d4SSatish Balay           x6 = pv[5];
3519371c9d4SSatish Balay           x7 = pv[6];
3529371c9d4SSatish Balay           x8 = pv[7];
3539371c9d4SSatish Balay           x9 = pv[8];
35417542729SShri Abhyankar           x  = rtmp + 9 * pj[j];
35517542729SShri Abhyankar           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
35617542729SShri Abhyankar           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
35717542729SShri Abhyankar           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
35817542729SShri Abhyankar 
35917542729SShri Abhyankar           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
36017542729SShri Abhyankar           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
36117542729SShri Abhyankar           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
36217542729SShri Abhyankar 
36317542729SShri Abhyankar           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
36417542729SShri Abhyankar           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
36517542729SShri Abhyankar           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
36617542729SShri Abhyankar           pv += 9;
36717542729SShri Abhyankar         }
3689566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
36917542729SShri Abhyankar       }
37017542729SShri Abhyankar       row = *ajtmp++;
37117542729SShri Abhyankar     }
37217542729SShri Abhyankar     /* finished row so stick it into b->a */
37317542729SShri Abhyankar     pv = ba + 9 * bi[i];
37417542729SShri Abhyankar     pj = bj + bi[i];
37517542729SShri Abhyankar     nz = bi[i + 1] - bi[i];
37617542729SShri Abhyankar     for (j = 0; j < nz; j++) {
37717542729SShri Abhyankar       x     = rtmp + 9 * pj[j];
3789371c9d4SSatish Balay       pv[0] = x[0];
3799371c9d4SSatish Balay       pv[1] = x[1];
3809371c9d4SSatish Balay       pv[2] = x[2];
3819371c9d4SSatish Balay       pv[3] = x[3];
3829371c9d4SSatish Balay       pv[4] = x[4];
3839371c9d4SSatish Balay       pv[5] = x[5];
3849371c9d4SSatish Balay       pv[6] = x[6];
3859371c9d4SSatish Balay       pv[7] = x[7];
3869371c9d4SSatish Balay       pv[8] = x[8];
38717542729SShri Abhyankar       pv += 9;
38817542729SShri Abhyankar     }
38917542729SShri Abhyankar     /* invert diagonal block */
39017542729SShri Abhyankar     w = ba + 9 * diag_offset[i];
3919566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
3927b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
39317542729SShri Abhyankar   }
39417542729SShri Abhyankar 
3959566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
39626fbe8dcSKarl Rupp 
39706e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
39806e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
39917542729SShri Abhyankar   C->assembled           = PETSC_TRUE;
40026fbe8dcSKarl Rupp 
4019566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
40217542729SShri Abhyankar   PetscFunctionReturn(0);
40317542729SShri Abhyankar }
40417542729SShri Abhyankar 
40517542729SShri Abhyankar /*
4064dd39f65SShri Abhyankar   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
4074dd39f65SShri Abhyankar     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
40817542729SShri Abhyankar */
4099371c9d4SSatish Balay PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) {
41017542729SShri Abhyankar   Mat             C = B;
41117542729SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
412bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
413bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
414bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
41517542729SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
416bbd65245SShri Abhyankar   PetscInt        flg;
417182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
418a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
41917542729SShri Abhyankar 
42017542729SShri Abhyankar   PetscFunctionBegin;
4210164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
4220164db54SHong Zhang 
42317542729SShri Abhyankar   /* generate work space needed by the factorization */
4249566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
4259566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
42617542729SShri Abhyankar 
42717542729SShri Abhyankar   for (i = 0; i < n; i++) {
42817542729SShri Abhyankar     /* zero rtmp */
42917542729SShri Abhyankar     /* L part */
43017542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
43117542729SShri Abhyankar     bjtmp = bj + bi[i];
432*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
43317542729SShri Abhyankar 
43417542729SShri Abhyankar     /* U part */
435b2b2dd24SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
436b2b2dd24SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
437*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
438b2b2dd24SShri Abhyankar 
439b2b2dd24SShri Abhyankar     /* load in initial (unfactored row) */
440b2b2dd24SShri Abhyankar     nz    = ai[i + 1] - ai[i];
441b2b2dd24SShri Abhyankar     ajtmp = aj + ai[i];
442b2b2dd24SShri Abhyankar     v     = aa + bs2 * ai[i];
443*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
444b2b2dd24SShri Abhyankar 
445b2b2dd24SShri Abhyankar     /* elimination */
446b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
447b2b2dd24SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
448b2b2dd24SShri Abhyankar     for (k = 0; k < nzL; k++) {
449b2b2dd24SShri Abhyankar       row = bjtmp[k];
450b2b2dd24SShri Abhyankar       pc  = rtmp + bs2 * row;
451c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
452c35f09e5SBarry Smith         if (pc[j] != 0.0) {
453c35f09e5SBarry Smith           flg = 1;
454c35f09e5SBarry Smith           break;
455c35f09e5SBarry Smith         }
456c35f09e5SBarry Smith       }
457b2b2dd24SShri Abhyankar       if (flg) {
458b2b2dd24SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
45996b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
4609566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
461b2b2dd24SShri Abhyankar 
462b2b2dd24SShri Abhyankar         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
463b2b2dd24SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
464b2b2dd24SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
465b2b2dd24SShri Abhyankar         for (j = 0; j < nz; j++) {
46696b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
467b2b2dd24SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
468b2b2dd24SShri Abhyankar           v = rtmp + bs2 * pj[j];
4699566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
470b2b2dd24SShri Abhyankar           pv += bs2;
471b2b2dd24SShri Abhyankar         }
4729566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
473b2b2dd24SShri Abhyankar       }
474b2b2dd24SShri Abhyankar     }
475b2b2dd24SShri Abhyankar 
476b2b2dd24SShri Abhyankar     /* finished row so stick it into b->a */
477b2b2dd24SShri Abhyankar     /* L part */
478b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bi[i];
479b2b2dd24SShri Abhyankar     pj = b->j + bi[i];
480b2b2dd24SShri Abhyankar     nz = bi[i + 1] - bi[i];
481*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
482b2b2dd24SShri Abhyankar 
483a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
484b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
485b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i];
4869566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
4879566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
4887b6c816cSBarry Smith     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
489b2b2dd24SShri Abhyankar 
490b2b2dd24SShri Abhyankar     /* U part */
491b2b2dd24SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
492b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
493b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
494*48a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
495b2b2dd24SShri Abhyankar   }
4969566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
49726fbe8dcSKarl Rupp 
4984dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
4999f5c0bcdSBarry Smith   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
5009f5c0bcdSBarry Smith   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
5014dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
502b2b2dd24SShri Abhyankar   C->assembled           = PETSC_TRUE;
50326fbe8dcSKarl Rupp 
5049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
505b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
506b2b2dd24SShri Abhyankar }
507