xref: /petsc/src/mat/impls/baij/seq/baijfact13.c (revision d71ae5a4db6382e7f06317b8d368875286fe9008)
1be1d678aSKris Buschelman 
283287d42SBarry Smith /*
383287d42SBarry Smith     Factorization code for BAIJ format.
483287d42SBarry Smith */
5c6db04a5SJed Brown #include <../src/mat/impls/baij/seq/baij.h>
6af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
783287d42SBarry Smith 
883287d42SBarry Smith /*
983287d42SBarry Smith       Version for when blocks are 3 by 3
1083287d42SBarry Smith */
11*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_inplace(Mat C, Mat A, const MatFactorInfo *info)
12*d71ae5a4SJacob Faibussowitsch {
1383287d42SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
1483287d42SBarry Smith   IS              isrow = b->row, isicol = b->icol;
155d0c19d7SBarry Smith   const PetscInt *r, *ic;
165d0c19d7SBarry Smith   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
17690b6cddSBarry Smith   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
18690b6cddSBarry Smith   PetscInt       *diag_offset = b->diag, idx, *pj;
1983287d42SBarry Smith   MatScalar      *pv, *v, *rtmp, *pc, *w, *x;
2083287d42SBarry Smith   MatScalar       p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
2183287d42SBarry Smith   MatScalar       p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
2283287d42SBarry Smith   MatScalar      *ba = b->a, *aa = a->a;
23182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
24a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
2583287d42SBarry Smith 
2683287d42SBarry Smith   PetscFunctionBegin;
279566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
289566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
299566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
305f8bbccaSHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
3183287d42SBarry Smith 
3283287d42SBarry Smith   for (i = 0; i < n; i++) {
3383287d42SBarry Smith     nz    = bi[i + 1] - bi[i];
3483287d42SBarry Smith     ajtmp = bj + bi[i];
3583287d42SBarry Smith     for (j = 0; j < nz; j++) {
3683287d42SBarry Smith       x    = rtmp + 9 * ajtmp[j];
3783287d42SBarry Smith       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
3883287d42SBarry Smith     }
3983287d42SBarry Smith     /* load in initial (unfactored row) */
4083287d42SBarry Smith     idx      = r[i];
4183287d42SBarry Smith     nz       = ai[idx + 1] - ai[idx];
4283287d42SBarry Smith     ajtmpold = aj + ai[idx];
4383287d42SBarry Smith     v        = aa + 9 * ai[idx];
4483287d42SBarry Smith     for (j = 0; j < nz; j++) {
4583287d42SBarry Smith       x    = rtmp + 9 * ic[ajtmpold[j]];
469371c9d4SSatish Balay       x[0] = v[0];
479371c9d4SSatish Balay       x[1] = v[1];
489371c9d4SSatish Balay       x[2] = v[2];
499371c9d4SSatish Balay       x[3] = v[3];
509371c9d4SSatish Balay       x[4] = v[4];
519371c9d4SSatish Balay       x[5] = v[5];
529371c9d4SSatish Balay       x[6] = v[6];
539371c9d4SSatish Balay       x[7] = v[7];
549371c9d4SSatish Balay       x[8] = v[8];
5583287d42SBarry Smith       v += 9;
5683287d42SBarry Smith     }
5783287d42SBarry Smith     row = *ajtmp++;
5883287d42SBarry Smith     while (row < i) {
5983287d42SBarry Smith       pc = rtmp + 9 * row;
609371c9d4SSatish Balay       p1 = pc[0];
619371c9d4SSatish Balay       p2 = pc[1];
629371c9d4SSatish Balay       p3 = pc[2];
639371c9d4SSatish Balay       p4 = pc[3];
649371c9d4SSatish Balay       p5 = pc[4];
659371c9d4SSatish Balay       p6 = pc[5];
669371c9d4SSatish Balay       p7 = pc[6];
679371c9d4SSatish Balay       p8 = pc[7];
689371c9d4SSatish Balay       p9 = pc[8];
699371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
7083287d42SBarry Smith         pv    = ba + 9 * diag_offset[row];
7183287d42SBarry Smith         pj    = bj + diag_offset[row] + 1;
729371c9d4SSatish Balay         x1    = pv[0];
739371c9d4SSatish Balay         x2    = pv[1];
749371c9d4SSatish Balay         x3    = pv[2];
759371c9d4SSatish Balay         x4    = pv[3];
769371c9d4SSatish Balay         x5    = pv[4];
779371c9d4SSatish Balay         x6    = pv[5];
789371c9d4SSatish Balay         x7    = pv[6];
799371c9d4SSatish Balay         x8    = pv[7];
809371c9d4SSatish Balay         x9    = pv[8];
8183287d42SBarry Smith         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
8283287d42SBarry Smith         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
8383287d42SBarry Smith         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
8483287d42SBarry Smith 
8583287d42SBarry Smith         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
8683287d42SBarry Smith         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
8783287d42SBarry Smith         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
8883287d42SBarry Smith 
8983287d42SBarry Smith         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
9083287d42SBarry Smith         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
9183287d42SBarry Smith         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
9283287d42SBarry Smith         nz         = bi[row + 1] - diag_offset[row] - 1;
9383287d42SBarry Smith         pv += 9;
9483287d42SBarry Smith         for (j = 0; j < nz; j++) {
959371c9d4SSatish Balay           x1 = pv[0];
969371c9d4SSatish Balay           x2 = pv[1];
979371c9d4SSatish Balay           x3 = pv[2];
989371c9d4SSatish Balay           x4 = pv[3];
999371c9d4SSatish Balay           x5 = pv[4];
1009371c9d4SSatish Balay           x6 = pv[5];
1019371c9d4SSatish Balay           x7 = pv[6];
1029371c9d4SSatish Balay           x8 = pv[7];
1039371c9d4SSatish Balay           x9 = pv[8];
10483287d42SBarry Smith           x  = rtmp + 9 * pj[j];
10583287d42SBarry Smith           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
10683287d42SBarry Smith           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
10783287d42SBarry Smith           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
10883287d42SBarry Smith 
10983287d42SBarry Smith           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
11083287d42SBarry Smith           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
11183287d42SBarry Smith           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
11283287d42SBarry Smith 
11383287d42SBarry Smith           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
11483287d42SBarry Smith           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
11583287d42SBarry Smith           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
11683287d42SBarry Smith           pv += 9;
11783287d42SBarry Smith         }
1189566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
11983287d42SBarry Smith       }
12083287d42SBarry Smith       row = *ajtmp++;
12183287d42SBarry Smith     }
12283287d42SBarry Smith     /* finished row so stick it into b->a */
12383287d42SBarry Smith     pv = ba + 9 * bi[i];
12483287d42SBarry Smith     pj = bj + bi[i];
12583287d42SBarry Smith     nz = bi[i + 1] - bi[i];
12683287d42SBarry Smith     for (j = 0; j < nz; j++) {
12783287d42SBarry Smith       x     = rtmp + 9 * pj[j];
1289371c9d4SSatish Balay       pv[0] = x[0];
1299371c9d4SSatish Balay       pv[1] = x[1];
1309371c9d4SSatish Balay       pv[2] = x[2];
1319371c9d4SSatish Balay       pv[3] = x[3];
1329371c9d4SSatish Balay       pv[4] = x[4];
1339371c9d4SSatish Balay       pv[5] = x[5];
1349371c9d4SSatish Balay       pv[6] = x[6];
1359371c9d4SSatish Balay       pv[7] = x[7];
1369371c9d4SSatish Balay       pv[8] = x[8];
13783287d42SBarry Smith       pv += 9;
13883287d42SBarry Smith     }
13983287d42SBarry Smith     /* invert diagonal block */
14083287d42SBarry Smith     w = ba + 9 * diag_offset[i];
1419566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
1427b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
14383287d42SBarry Smith   }
14483287d42SBarry Smith 
1459566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
1469566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
1479566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
14826fbe8dcSKarl Rupp 
14906e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_3_inplace;
15006e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_inplace;
15183287d42SBarry Smith   C->assembled           = PETSC_TRUE;
15226fbe8dcSKarl Rupp 
1539566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
15483287d42SBarry Smith   PetscFunctionReturn(0);
15583287d42SBarry Smith }
15617542729SShri Abhyankar 
1574dd39f65SShri Abhyankar /* MatLUFactorNumeric_SeqBAIJ_3 -
1584dd39f65SShri Abhyankar      copied from MatLUFactorNumeric_SeqBAIJ_N_inplace() and manually re-implemented
15996b95a6bSBarry Smith        PetscKernel_A_gets_A_times_B()
16096b95a6bSBarry Smith        PetscKernel_A_gets_A_minus_B_times_C()
16196b95a6bSBarry Smith        PetscKernel_A_gets_inverse_A()
16217542729SShri Abhyankar */
163*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3(Mat B, Mat A, const MatFactorInfo *info)
164*d71ae5a4SJacob Faibussowitsch {
16517542729SShri Abhyankar   Mat             C = B;
16617542729SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
16717542729SShri Abhyankar   IS              isrow = b->row, isicol = b->icol;
1685a586d82SBarry Smith   const PetscInt *r, *ic;
169bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
170bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
171bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
17217542729SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
173bbd65245SShri Abhyankar   PetscInt        flg;
174182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
175a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
17617542729SShri Abhyankar 
17717542729SShri Abhyankar   PetscFunctionBegin;
1789566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow, &r));
1799566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isicol, &ic));
1800164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
18117542729SShri Abhyankar 
18217542729SShri Abhyankar   /* generate work space needed by the factorization */
1839566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
1849566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
18517542729SShri Abhyankar 
18617542729SShri Abhyankar   for (i = 0; i < n; i++) {
18717542729SShri Abhyankar     /* zero rtmp */
18817542729SShri Abhyankar     /* L part */
18917542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
19017542729SShri Abhyankar     bjtmp = bj + bi[i];
19148a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
19217542729SShri Abhyankar 
19317542729SShri Abhyankar     /* U part */
1940c4413a7SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
1950c4413a7SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
19648a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
1970c4413a7SShri Abhyankar 
1980c4413a7SShri Abhyankar     /* load in initial (unfactored row) */
1990c4413a7SShri Abhyankar     nz    = ai[r[i] + 1] - ai[r[i]];
2000c4413a7SShri Abhyankar     ajtmp = aj + ai[r[i]];
2010c4413a7SShri Abhyankar     v     = aa + bs2 * ai[r[i]];
20248a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
2030c4413a7SShri Abhyankar 
2040c4413a7SShri Abhyankar     /* elimination */
2050c4413a7SShri Abhyankar     bjtmp = bj + bi[i];
2060c4413a7SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
2070c4413a7SShri Abhyankar     for (k = 0; k < nzL; k++) {
2080c4413a7SShri Abhyankar       row = bjtmp[k];
2090c4413a7SShri Abhyankar       pc  = rtmp + bs2 * row;
210c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
211c35f09e5SBarry Smith         if (pc[j] != 0.0) {
212c35f09e5SBarry Smith           flg = 1;
213c35f09e5SBarry Smith           break;
214c35f09e5SBarry Smith         }
215c35f09e5SBarry Smith       }
2160c4413a7SShri Abhyankar       if (flg) {
2170c4413a7SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
21896b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
2199566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
2200c4413a7SShri Abhyankar 
2210c4413a7SShri Abhyankar         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
2220c4413a7SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
2230c4413a7SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
2240c4413a7SShri Abhyankar         for (j = 0; j < nz; j++) {
22596b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
2260c4413a7SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
2270c4413a7SShri Abhyankar           v = rtmp + bs2 * pj[j];
2289566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
2290c4413a7SShri Abhyankar           pv += bs2;
2300c4413a7SShri Abhyankar         }
2319566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
2320c4413a7SShri Abhyankar       }
2330c4413a7SShri Abhyankar     }
2340c4413a7SShri Abhyankar 
2350c4413a7SShri Abhyankar     /* finished row so stick it into b->a */
2360c4413a7SShri Abhyankar     /* L part */
2370c4413a7SShri Abhyankar     pv = b->a + bs2 * bi[i];
2380c4413a7SShri Abhyankar     pj = b->j + bi[i];
2390c4413a7SShri Abhyankar     nz = bi[i + 1] - bi[i];
24048a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2410c4413a7SShri Abhyankar 
242a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
2430c4413a7SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
2440c4413a7SShri Abhyankar     pj = b->j + bdiag[i];
2459566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
2469566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
2477b6c816cSBarry Smith     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
2480c4413a7SShri Abhyankar 
2490c4413a7SShri Abhyankar     /* U part */
2500c4413a7SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
2510c4413a7SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
2520c4413a7SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
25348a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
2540c4413a7SShri Abhyankar   }
2550c4413a7SShri Abhyankar 
2569566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
2579566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isicol, &ic));
2589566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &r));
25926fbe8dcSKarl Rupp 
2604dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_3;
2614dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3;
2620c4413a7SShri Abhyankar   C->assembled           = PETSC_TRUE;
26326fbe8dcSKarl Rupp 
2649566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
2650c4413a7SShri Abhyankar   PetscFunctionReturn(0);
2660c4413a7SShri Abhyankar }
2670c4413a7SShri Abhyankar 
268*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
269*d71ae5a4SJacob Faibussowitsch {
27017542729SShri Abhyankar   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
27117542729SShri Abhyankar   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
27217542729SShri Abhyankar   PetscInt    *ajtmpold, *ajtmp, nz, row;
27317542729SShri Abhyankar   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
27417542729SShri Abhyankar   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
27517542729SShri Abhyankar   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, m5, m6, m7, m8, m9, x1, x2, x3, x4;
27617542729SShri Abhyankar   MatScalar    p5, p6, p7, p8, p9, x5, x6, x7, x8, x9;
27717542729SShri Abhyankar   MatScalar   *ba = b->a, *aa = a->a;
278182b8fbaSHong Zhang   PetscReal    shift = info->shiftamount;
279a455e926SHong Zhang   PetscBool    allowzeropivot, zeropivotdetected;
28017542729SShri Abhyankar 
28117542729SShri Abhyankar   PetscFunctionBegin;
2829566063dSJacob Faibussowitsch   PetscCall(PetscMalloc1(9 * (n + 1), &rtmp));
2830164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
28417542729SShri Abhyankar 
28517542729SShri Abhyankar   for (i = 0; i < n; i++) {
28617542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
28717542729SShri Abhyankar     ajtmp = bj + bi[i];
28817542729SShri Abhyankar     for (j = 0; j < nz; j++) {
28917542729SShri Abhyankar       x    = rtmp + 9 * ajtmp[j];
29017542729SShri Abhyankar       x[0] = x[1] = x[2] = x[3] = x[4] = x[5] = x[6] = x[7] = x[8] = 0.0;
29117542729SShri Abhyankar     }
29217542729SShri Abhyankar     /* load in initial (unfactored row) */
29317542729SShri Abhyankar     nz       = ai[i + 1] - ai[i];
29417542729SShri Abhyankar     ajtmpold = aj + ai[i];
29517542729SShri Abhyankar     v        = aa + 9 * ai[i];
29617542729SShri Abhyankar     for (j = 0; j < nz; j++) {
29717542729SShri Abhyankar       x    = rtmp + 9 * ajtmpold[j];
2989371c9d4SSatish Balay       x[0] = v[0];
2999371c9d4SSatish Balay       x[1] = v[1];
3009371c9d4SSatish Balay       x[2] = v[2];
3019371c9d4SSatish Balay       x[3] = v[3];
3029371c9d4SSatish Balay       x[4] = v[4];
3039371c9d4SSatish Balay       x[5] = v[5];
3049371c9d4SSatish Balay       x[6] = v[6];
3059371c9d4SSatish Balay       x[7] = v[7];
3069371c9d4SSatish Balay       x[8] = v[8];
30717542729SShri Abhyankar       v += 9;
30817542729SShri Abhyankar     }
30917542729SShri Abhyankar     row = *ajtmp++;
31017542729SShri Abhyankar     while (row < i) {
31117542729SShri Abhyankar       pc = rtmp + 9 * row;
3129371c9d4SSatish Balay       p1 = pc[0];
3139371c9d4SSatish Balay       p2 = pc[1];
3149371c9d4SSatish Balay       p3 = pc[2];
3159371c9d4SSatish Balay       p4 = pc[3];
3169371c9d4SSatish Balay       p5 = pc[4];
3179371c9d4SSatish Balay       p6 = pc[5];
3189371c9d4SSatish Balay       p7 = pc[6];
3199371c9d4SSatish Balay       p8 = pc[7];
3209371c9d4SSatish Balay       p9 = pc[8];
3219371c9d4SSatish Balay       if (p1 != 0.0 || p2 != 0.0 || p3 != 0.0 || p4 != 0.0 || p5 != 0.0 || p6 != 0.0 || p7 != 0.0 || p8 != 0.0 || p9 != 0.0) {
32217542729SShri Abhyankar         pv    = ba + 9 * diag_offset[row];
32317542729SShri Abhyankar         pj    = bj + diag_offset[row] + 1;
3249371c9d4SSatish Balay         x1    = pv[0];
3259371c9d4SSatish Balay         x2    = pv[1];
3269371c9d4SSatish Balay         x3    = pv[2];
3279371c9d4SSatish Balay         x4    = pv[3];
3289371c9d4SSatish Balay         x5    = pv[4];
3299371c9d4SSatish Balay         x6    = pv[5];
3309371c9d4SSatish Balay         x7    = pv[6];
3319371c9d4SSatish Balay         x8    = pv[7];
3329371c9d4SSatish Balay         x9    = pv[8];
33317542729SShri Abhyankar         pc[0] = m1 = p1 * x1 + p4 * x2 + p7 * x3;
33417542729SShri Abhyankar         pc[1] = m2 = p2 * x1 + p5 * x2 + p8 * x3;
33517542729SShri Abhyankar         pc[2] = m3 = p3 * x1 + p6 * x2 + p9 * x3;
33617542729SShri Abhyankar 
33717542729SShri Abhyankar         pc[3] = m4 = p1 * x4 + p4 * x5 + p7 * x6;
33817542729SShri Abhyankar         pc[4] = m5 = p2 * x4 + p5 * x5 + p8 * x6;
33917542729SShri Abhyankar         pc[5] = m6 = p3 * x4 + p6 * x5 + p9 * x6;
34017542729SShri Abhyankar 
34117542729SShri Abhyankar         pc[6] = m7 = p1 * x7 + p4 * x8 + p7 * x9;
34217542729SShri Abhyankar         pc[7] = m8 = p2 * x7 + p5 * x8 + p8 * x9;
34317542729SShri Abhyankar         pc[8] = m9 = p3 * x7 + p6 * x8 + p9 * x9;
34417542729SShri Abhyankar 
34517542729SShri Abhyankar         nz = bi[row + 1] - diag_offset[row] - 1;
34617542729SShri Abhyankar         pv += 9;
34717542729SShri Abhyankar         for (j = 0; j < nz; j++) {
3489371c9d4SSatish Balay           x1 = pv[0];
3499371c9d4SSatish Balay           x2 = pv[1];
3509371c9d4SSatish Balay           x3 = pv[2];
3519371c9d4SSatish Balay           x4 = pv[3];
3529371c9d4SSatish Balay           x5 = pv[4];
3539371c9d4SSatish Balay           x6 = pv[5];
3549371c9d4SSatish Balay           x7 = pv[6];
3559371c9d4SSatish Balay           x8 = pv[7];
3569371c9d4SSatish Balay           x9 = pv[8];
35717542729SShri Abhyankar           x  = rtmp + 9 * pj[j];
35817542729SShri Abhyankar           x[0] -= m1 * x1 + m4 * x2 + m7 * x3;
35917542729SShri Abhyankar           x[1] -= m2 * x1 + m5 * x2 + m8 * x3;
36017542729SShri Abhyankar           x[2] -= m3 * x1 + m6 * x2 + m9 * x3;
36117542729SShri Abhyankar 
36217542729SShri Abhyankar           x[3] -= m1 * x4 + m4 * x5 + m7 * x6;
36317542729SShri Abhyankar           x[4] -= m2 * x4 + m5 * x5 + m8 * x6;
36417542729SShri Abhyankar           x[5] -= m3 * x4 + m6 * x5 + m9 * x6;
36517542729SShri Abhyankar 
36617542729SShri Abhyankar           x[6] -= m1 * x7 + m4 * x8 + m7 * x9;
36717542729SShri Abhyankar           x[7] -= m2 * x7 + m5 * x8 + m8 * x9;
36817542729SShri Abhyankar           x[8] -= m3 * x7 + m6 * x8 + m9 * x9;
36917542729SShri Abhyankar           pv += 9;
37017542729SShri Abhyankar         }
3719566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 36.0));
37217542729SShri Abhyankar       }
37317542729SShri Abhyankar       row = *ajtmp++;
37417542729SShri Abhyankar     }
37517542729SShri Abhyankar     /* finished row so stick it into b->a */
37617542729SShri Abhyankar     pv = ba + 9 * bi[i];
37717542729SShri Abhyankar     pj = bj + bi[i];
37817542729SShri Abhyankar     nz = bi[i + 1] - bi[i];
37917542729SShri Abhyankar     for (j = 0; j < nz; j++) {
38017542729SShri Abhyankar       x     = rtmp + 9 * pj[j];
3819371c9d4SSatish Balay       pv[0] = x[0];
3829371c9d4SSatish Balay       pv[1] = x[1];
3839371c9d4SSatish Balay       pv[2] = x[2];
3849371c9d4SSatish Balay       pv[3] = x[3];
3859371c9d4SSatish Balay       pv[4] = x[4];
3869371c9d4SSatish Balay       pv[5] = x[5];
3879371c9d4SSatish Balay       pv[6] = x[6];
3889371c9d4SSatish Balay       pv[7] = x[7];
3899371c9d4SSatish Balay       pv[8] = x[8];
39017542729SShri Abhyankar       pv += 9;
39117542729SShri Abhyankar     }
39217542729SShri Abhyankar     /* invert diagonal block */
39317542729SShri Abhyankar     w = ba + 9 * diag_offset[i];
3949566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(w, shift, allowzeropivot, &zeropivotdetected));
3957b6c816cSBarry Smith     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
39617542729SShri Abhyankar   }
39717542729SShri Abhyankar 
3989566063dSJacob Faibussowitsch   PetscCall(PetscFree(rtmp));
39926fbe8dcSKarl Rupp 
40006e38f1dSHong Zhang   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering_inplace;
40106e38f1dSHong Zhang   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace;
40217542729SShri Abhyankar   C->assembled           = PETSC_TRUE;
40326fbe8dcSKarl Rupp 
4049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * b->mbs)); /* from inverting diagonal blocks */
40517542729SShri Abhyankar   PetscFunctionReturn(0);
40617542729SShri Abhyankar }
40717542729SShri Abhyankar 
40817542729SShri Abhyankar /*
4094dd39f65SShri Abhyankar   MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering -
4104dd39f65SShri Abhyankar     copied from MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace()
41117542729SShri Abhyankar */
412*d71ae5a4SJacob Faibussowitsch PetscErrorCode MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
413*d71ae5a4SJacob Faibussowitsch {
41417542729SShri Abhyankar   Mat             C = B;
41517542729SShri Abhyankar   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
416bbd65245SShri Abhyankar   PetscInt        i, j, k, nz, nzL, row;
417bbd65245SShri Abhyankar   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
418bbd65245SShri Abhyankar   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag, *pj, bs2 = a->bs2;
41917542729SShri Abhyankar   MatScalar      *rtmp, *pc, *mwork, *v, *pv, *aa = a->a;
420bbd65245SShri Abhyankar   PetscInt        flg;
421182b8fbaSHong Zhang   PetscReal       shift = info->shiftamount;
422a455e926SHong Zhang   PetscBool       allowzeropivot, zeropivotdetected;
42317542729SShri Abhyankar 
42417542729SShri Abhyankar   PetscFunctionBegin;
4250164db54SHong Zhang   allowzeropivot = PetscNot(A->erroriffailure);
4260164db54SHong Zhang 
42717542729SShri Abhyankar   /* generate work space needed by the factorization */
4289566063dSJacob Faibussowitsch   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
4299566063dSJacob Faibussowitsch   PetscCall(PetscArrayzero(rtmp, bs2 * n));
43017542729SShri Abhyankar 
43117542729SShri Abhyankar   for (i = 0; i < n; i++) {
43217542729SShri Abhyankar     /* zero rtmp */
43317542729SShri Abhyankar     /* L part */
43417542729SShri Abhyankar     nz    = bi[i + 1] - bi[i];
43517542729SShri Abhyankar     bjtmp = bj + bi[i];
43648a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
43717542729SShri Abhyankar 
43817542729SShri Abhyankar     /* U part */
439b2b2dd24SShri Abhyankar     nz    = bdiag[i] - bdiag[i + 1];
440b2b2dd24SShri Abhyankar     bjtmp = bj + bdiag[i + 1] + 1;
44148a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
442b2b2dd24SShri Abhyankar 
443b2b2dd24SShri Abhyankar     /* load in initial (unfactored row) */
444b2b2dd24SShri Abhyankar     nz    = ai[i + 1] - ai[i];
445b2b2dd24SShri Abhyankar     ajtmp = aj + ai[i];
446b2b2dd24SShri Abhyankar     v     = aa + bs2 * ai[i];
44748a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
448b2b2dd24SShri Abhyankar 
449b2b2dd24SShri Abhyankar     /* elimination */
450b2b2dd24SShri Abhyankar     bjtmp = bj + bi[i];
451b2b2dd24SShri Abhyankar     nzL   = bi[i + 1] - bi[i];
452b2b2dd24SShri Abhyankar     for (k = 0; k < nzL; k++) {
453b2b2dd24SShri Abhyankar       row = bjtmp[k];
454b2b2dd24SShri Abhyankar       pc  = rtmp + bs2 * row;
455c35f09e5SBarry Smith       for (flg = 0, j = 0; j < bs2; j++) {
456c35f09e5SBarry Smith         if (pc[j] != 0.0) {
457c35f09e5SBarry Smith           flg = 1;
458c35f09e5SBarry Smith           break;
459c35f09e5SBarry Smith         }
460c35f09e5SBarry Smith       }
461b2b2dd24SShri Abhyankar       if (flg) {
462b2b2dd24SShri Abhyankar         pv = b->a + bs2 * bdiag[row];
46396b95a6bSBarry Smith         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
4649566063dSJacob Faibussowitsch         PetscCall(PetscKernel_A_gets_A_times_B_3(pc, pv, mwork));
465b2b2dd24SShri Abhyankar 
466b2b2dd24SShri Abhyankar         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
467b2b2dd24SShri Abhyankar         pv = b->a + bs2 * (bdiag[row + 1] + 1);
468b2b2dd24SShri Abhyankar         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
469b2b2dd24SShri Abhyankar         for (j = 0; j < nz; j++) {
47096b95a6bSBarry Smith           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
471b2b2dd24SShri Abhyankar           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
472b2b2dd24SShri Abhyankar           v = rtmp + bs2 * pj[j];
4739566063dSJacob Faibussowitsch           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_3(v, pc, pv));
474b2b2dd24SShri Abhyankar           pv += bs2;
475b2b2dd24SShri Abhyankar         }
4769566063dSJacob Faibussowitsch         PetscCall(PetscLogFlops(54.0 * nz + 45)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
477b2b2dd24SShri Abhyankar       }
478b2b2dd24SShri Abhyankar     }
479b2b2dd24SShri Abhyankar 
480b2b2dd24SShri Abhyankar     /* finished row so stick it into b->a */
481b2b2dd24SShri Abhyankar     /* L part */
482b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bi[i];
483b2b2dd24SShri Abhyankar     pj = b->j + bi[i];
484b2b2dd24SShri Abhyankar     nz = bi[i + 1] - bi[i];
48548a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
486b2b2dd24SShri Abhyankar 
487a5b23f4aSJose E. Roman     /* Mark diagonal and invert diagonal for simpler triangular solves */
488b2b2dd24SShri Abhyankar     pv = b->a + bs2 * bdiag[i];
489b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i];
4909566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
4919566063dSJacob Faibussowitsch     PetscCall(PetscKernel_A_gets_inverse_A_3(pv, shift, allowzeropivot, &zeropivotdetected));
4927b6c816cSBarry Smith     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
493b2b2dd24SShri Abhyankar 
494b2b2dd24SShri Abhyankar     /* U part */
495b2b2dd24SShri Abhyankar     pv = b->a + bs2 * (bdiag[i + 1] + 1);
496b2b2dd24SShri Abhyankar     pj = b->j + bdiag[i + 1] + 1;
497b2b2dd24SShri Abhyankar     nz = bdiag[i] - bdiag[i + 1] - 1;
49848a46eb9SPierre Jolivet     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
499b2b2dd24SShri Abhyankar   }
5009566063dSJacob Faibussowitsch   PetscCall(PetscFree2(rtmp, mwork));
50126fbe8dcSKarl Rupp 
5024dd39f65SShri Abhyankar   C->ops->solve          = MatSolve_SeqBAIJ_3_NaturalOrdering;
5039f5c0bcdSBarry Smith   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_3_NaturalOrdering;
5049f5c0bcdSBarry Smith   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_3_NaturalOrdering;
5054dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
506b2b2dd24SShri Abhyankar   C->assembled           = PETSC_TRUE;
50726fbe8dcSKarl Rupp 
5089566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(1.333333333333 * 3 * 3 * 3 * n)); /* from inverting diagonal blocks */
509b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
510b2b2dd24SShri Abhyankar }
511