xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision e0f5bfbec699682fa3e8b8532b1176849ea4e12a)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info) {
9   Mat             C = B;
10   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
11   IS              isrow = b->row, isicol = b->icol;
12   const PetscInt *r, *ic;
13   PetscInt        i, j, k, nz, nzL, row, *pj;
14   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
15   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
16   MatScalar      *rtmp, *pc, *mwork, *pv;
17   MatScalar      *aa = a->a, *v;
18   PetscInt        flg;
19   PetscReal       shift = info->shiftamount;
20   PetscBool       allowzeropivot, zeropivotdetected;
21 
22   PetscFunctionBegin;
23   PetscCall(ISGetIndices(isrow, &r));
24   PetscCall(ISGetIndices(isicol, &ic));
25   allowzeropivot = PetscNot(A->erroriffailure);
26 
27   /* generate work space needed by the factorization */
28   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
29   PetscCall(PetscArrayzero(rtmp, bs2 * n));
30 
31   for (i = 0; i < n; i++) {
32     /* zero rtmp */
33     /* L part */
34     nz    = bi[i + 1] - bi[i];
35     bjtmp = bj + bi[i];
36     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
37 
38     /* U part */
39     nz    = bdiag[i] - bdiag[i + 1];
40     bjtmp = bj + bdiag[i + 1] + 1;
41     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
42 
43     /* load in initial (unfactored row) */
44     nz    = ai[r[i] + 1] - ai[r[i]];
45     ajtmp = aj + ai[r[i]];
46     v     = aa + bs2 * ai[r[i]];
47     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
48 
49     /* elimination */
50     bjtmp = bj + bi[i];
51     nzL   = bi[i + 1] - bi[i];
52     for (k = 0; k < nzL; k++) {
53       row = bjtmp[k];
54       pc  = rtmp + bs2 * row;
55       for (flg = 0, j = 0; j < bs2; j++) {
56         if (pc[j] != (PetscScalar)0.0) {
57           flg = 1;
58           break;
59         }
60       }
61       if (flg) {
62         pv = b->a + bs2 * bdiag[row];
63         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
64         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
65 
66         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
67         pv = b->a + bs2 * (bdiag[row + 1] + 1);
68         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
69         for (j = 0; j < nz; j++) {
70           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
71           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
72           v = rtmp + 4 * pj[j];
73           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
74           pv += 4;
75         }
76         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
77       }
78     }
79 
80     /* finished row so stick it into b->a */
81     /* L part */
82     pv = b->a + bs2 * bi[i];
83     pj = b->j + bi[i];
84     nz = bi[i + 1] - bi[i];
85     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
86 
87     /* Mark diagonal and invert diagonal for simpler triangular solves */
88     pv = b->a + bs2 * bdiag[i];
89     pj = b->j + bdiag[i];
90     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
91     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
92     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
93 
94     /* U part */
95     pv = b->a + bs2 * (bdiag[i + 1] + 1);
96     pj = b->j + bdiag[i + 1] + 1;
97     nz = bdiag[i] - bdiag[i + 1] - 1;
98     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
99   }
100 
101   PetscCall(PetscFree2(rtmp, mwork));
102   PetscCall(ISRestoreIndices(isicol, &ic));
103   PetscCall(ISRestoreIndices(isrow, &r));
104 
105   C->ops->solve          = MatSolve_SeqBAIJ_2;
106   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
107   C->assembled           = PETSC_TRUE;
108 
109   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
110   PetscFunctionReturn(0);
111 }
112 
113 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info) {
114   Mat             C = B;
115   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
116   PetscInt        i, j, k, nz, nzL, row, *pj;
117   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
118   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
119   MatScalar      *rtmp, *pc, *mwork, *pv;
120   MatScalar      *aa = a->a, *v;
121   PetscInt        flg;
122   PetscReal       shift = info->shiftamount;
123   PetscBool       allowzeropivot, zeropivotdetected;
124 
125   PetscFunctionBegin;
126   allowzeropivot = PetscNot(A->erroriffailure);
127 
128   /* generate work space needed by the factorization */
129   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
130   PetscCall(PetscArrayzero(rtmp, bs2 * n));
131 
132   for (i = 0; i < n; i++) {
133     /* zero rtmp */
134     /* L part */
135     nz    = bi[i + 1] - bi[i];
136     bjtmp = bj + bi[i];
137     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
138 
139     /* U part */
140     nz    = bdiag[i] - bdiag[i + 1];
141     bjtmp = bj + bdiag[i + 1] + 1;
142     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
143 
144     /* load in initial (unfactored row) */
145     nz    = ai[i + 1] - ai[i];
146     ajtmp = aj + ai[i];
147     v     = aa + bs2 * ai[i];
148     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
149 
150     /* elimination */
151     bjtmp = bj + bi[i];
152     nzL   = bi[i + 1] - bi[i];
153     for (k = 0; k < nzL; k++) {
154       row = bjtmp[k];
155       pc  = rtmp + bs2 * row;
156       for (flg = 0, j = 0; j < bs2; j++) {
157         if (pc[j] != (PetscScalar)0.0) {
158           flg = 1;
159           break;
160         }
161       }
162       if (flg) {
163         pv = b->a + bs2 * bdiag[row];
164         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
165         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
166 
167         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
168         pv = b->a + bs2 * (bdiag[row + 1] + 1);
169         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
170         for (j = 0; j < nz; j++) {
171           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
172           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
173           v = rtmp + 4 * pj[j];
174           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
175           pv += 4;
176         }
177         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
178       }
179     }
180 
181     /* finished row so stick it into b->a */
182     /* L part */
183     pv = b->a + bs2 * bi[i];
184     pj = b->j + bi[i];
185     nz = bi[i + 1] - bi[i];
186     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
187 
188     /* Mark diagonal and invert diagonal for simpler triangular solves */
189     pv = b->a + bs2 * bdiag[i];
190     pj = b->j + bdiag[i];
191     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
192     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
193     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
194 
195     /* U part */
196     /*
197     pv = b->a + bs2*bi[2*n-i];
198     pj = b->j + bi[2*n-i];
199     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
200     */
201     pv = b->a + bs2 * (bdiag[i + 1] + 1);
202     pj = b->j + bdiag[i + 1] + 1;
203     nz = bdiag[i] - bdiag[i + 1] - 1;
204     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
205   }
206   PetscCall(PetscFree2(rtmp, mwork));
207 
208   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
209   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
210   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
211   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
212   C->assembled           = PETSC_TRUE;
213 
214   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
215   PetscFunctionReturn(0);
216 }
217 
218 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info) {
219   Mat             C = B;
220   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
221   IS              isrow = b->row, isicol = b->icol;
222   const PetscInt *r, *ic;
223   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
224   PetscInt       *ajtmpold, *ajtmp, nz, row;
225   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
226   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
227   MatScalar       p1, p2, p3, p4;
228   MatScalar      *ba = b->a, *aa = a->a;
229   PetscReal       shift = info->shiftamount;
230   PetscBool       allowzeropivot, zeropivotdetected;
231 
232   PetscFunctionBegin;
233   allowzeropivot = PetscNot(A->erroriffailure);
234   PetscCall(ISGetIndices(isrow, &r));
235   PetscCall(ISGetIndices(isicol, &ic));
236   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
237 
238   for (i = 0; i < n; i++) {
239     nz    = bi[i + 1] - bi[i];
240     ajtmp = bj + bi[i];
241     for (j = 0; j < nz; j++) {
242       x    = rtmp + 4 * ajtmp[j];
243       x[0] = x[1] = x[2] = x[3] = 0.0;
244     }
245     /* load in initial (unfactored row) */
246     idx      = r[i];
247     nz       = ai[idx + 1] - ai[idx];
248     ajtmpold = aj + ai[idx];
249     v        = aa + 4 * ai[idx];
250     for (j = 0; j < nz; j++) {
251       x    = rtmp + 4 * ic[ajtmpold[j]];
252       x[0] = v[0];
253       x[1] = v[1];
254       x[2] = v[2];
255       x[3] = v[3];
256       v += 4;
257     }
258     row = *ajtmp++;
259     while (row < i) {
260       pc = rtmp + 4 * row;
261       p1 = pc[0];
262       p2 = pc[1];
263       p3 = pc[2];
264       p4 = pc[3];
265       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
266         pv    = ba + 4 * diag_offset[row];
267         pj    = bj + diag_offset[row] + 1;
268         x1    = pv[0];
269         x2    = pv[1];
270         x3    = pv[2];
271         x4    = pv[3];
272         pc[0] = m1 = p1 * x1 + p3 * x2;
273         pc[1] = m2 = p2 * x1 + p4 * x2;
274         pc[2] = m3 = p1 * x3 + p3 * x4;
275         pc[3] = m4 = p2 * x3 + p4 * x4;
276         nz         = bi[row + 1] - diag_offset[row] - 1;
277         pv += 4;
278         for (j = 0; j < nz; j++) {
279           x1 = pv[0];
280           x2 = pv[1];
281           x3 = pv[2];
282           x4 = pv[3];
283           x  = rtmp + 4 * pj[j];
284           x[0] -= m1 * x1 + m3 * x2;
285           x[1] -= m2 * x1 + m4 * x2;
286           x[2] -= m1 * x3 + m3 * x4;
287           x[3] -= m2 * x3 + m4 * x4;
288           pv += 4;
289         }
290         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
291       }
292       row = *ajtmp++;
293     }
294     /* finished row so stick it into b->a */
295     pv = ba + 4 * bi[i];
296     pj = bj + bi[i];
297     nz = bi[i + 1] - bi[i];
298     for (j = 0; j < nz; j++) {
299       x     = rtmp + 4 * pj[j];
300       pv[0] = x[0];
301       pv[1] = x[1];
302       pv[2] = x[2];
303       pv[3] = x[3];
304       pv += 4;
305     }
306     /* invert diagonal block */
307     w = ba + 4 * diag_offset[i];
308     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
309     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
310   }
311 
312   PetscCall(PetscFree(rtmp));
313   PetscCall(ISRestoreIndices(isicol, &ic));
314   PetscCall(ISRestoreIndices(isrow, &r));
315 
316   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
317   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
318   C->assembled           = PETSC_TRUE;
319 
320   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
321   PetscFunctionReturn(0);
322 }
323 /*
324       Version for when blocks are 2 by 2 Using natural ordering
325 */
326 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info) {
327   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
328   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
329   PetscInt    *ajtmpold, *ajtmp, nz, row;
330   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
331   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
332   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
333   MatScalar   *ba = b->a, *aa = a->a;
334   PetscReal    shift = info->shiftamount;
335   PetscBool    allowzeropivot, zeropivotdetected;
336 
337   PetscFunctionBegin;
338   allowzeropivot = PetscNot(A->erroriffailure);
339   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
340   for (i = 0; i < n; i++) {
341     nz    = bi[i + 1] - bi[i];
342     ajtmp = bj + bi[i];
343     for (j = 0; j < nz; j++) {
344       x    = rtmp + 4 * ajtmp[j];
345       x[0] = x[1] = x[2] = x[3] = 0.0;
346     }
347     /* load in initial (unfactored row) */
348     nz       = ai[i + 1] - ai[i];
349     ajtmpold = aj + ai[i];
350     v        = aa + 4 * ai[i];
351     for (j = 0; j < nz; j++) {
352       x    = rtmp + 4 * ajtmpold[j];
353       x[0] = v[0];
354       x[1] = v[1];
355       x[2] = v[2];
356       x[3] = v[3];
357       v += 4;
358     }
359     row = *ajtmp++;
360     while (row < i) {
361       pc = rtmp + 4 * row;
362       p1 = pc[0];
363       p2 = pc[1];
364       p3 = pc[2];
365       p4 = pc[3];
366       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
367         pv    = ba + 4 * diag_offset[row];
368         pj    = bj + diag_offset[row] + 1;
369         x1    = pv[0];
370         x2    = pv[1];
371         x3    = pv[2];
372         x4    = pv[3];
373         pc[0] = m1 = p1 * x1 + p3 * x2;
374         pc[1] = m2 = p2 * x1 + p4 * x2;
375         pc[2] = m3 = p1 * x3 + p3 * x4;
376         pc[3] = m4 = p2 * x3 + p4 * x4;
377         nz         = bi[row + 1] - diag_offset[row] - 1;
378         pv += 4;
379         for (j = 0; j < nz; j++) {
380           x1 = pv[0];
381           x2 = pv[1];
382           x3 = pv[2];
383           x4 = pv[3];
384           x  = rtmp + 4 * pj[j];
385           x[0] -= m1 * x1 + m3 * x2;
386           x[1] -= m2 * x1 + m4 * x2;
387           x[2] -= m1 * x3 + m3 * x4;
388           x[3] -= m2 * x3 + m4 * x4;
389           pv += 4;
390         }
391         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
392       }
393       row = *ajtmp++;
394     }
395     /* finished row so stick it into b->a */
396     pv = ba + 4 * bi[i];
397     pj = bj + bi[i];
398     nz = bi[i + 1] - bi[i];
399     for (j = 0; j < nz; j++) {
400       x     = rtmp + 4 * pj[j];
401       pv[0] = x[0];
402       pv[1] = x[1];
403       pv[2] = x[2];
404       pv[3] = x[3];
405       /*
406       printf(" col %d:",pj[j]);
407       PetscInt j1;
408       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
409       printf("\n");
410       */
411       pv += 4;
412     }
413     /* invert diagonal block */
414     w = ba + 4 * diag_offset[i];
415     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
416     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
417   }
418 
419   PetscCall(PetscFree(rtmp));
420 
421   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
422   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
423   C->assembled           = PETSC_TRUE;
424 
425   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
426   PetscFunctionReturn(0);
427 }
428 
429 /* ----------------------------------------------------------- */
430 /*
431      Version for when blocks are 1 by 1.
432 */
433 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info) {
434   Mat              C = B;
435   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
436   IS               isrow = b->row, isicol = b->icol;
437   const PetscInt  *r, *ic, *ics;
438   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
439   PetscInt         i, j, k, nz, nzL, row, *pj;
440   const PetscInt  *ajtmp, *bjtmp;
441   MatScalar       *rtmp, *pc, multiplier, *pv;
442   const MatScalar *aa = a->a, *v;
443   PetscBool        row_identity, col_identity;
444   FactorShiftCtx   sctx;
445   const PetscInt  *ddiag;
446   PetscReal        rs;
447   MatScalar        d;
448 
449   PetscFunctionBegin;
450   /* MatPivotSetUp(): initialize shift context sctx */
451   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
452 
453   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
454     ddiag          = a->diag;
455     sctx.shift_top = info->zeropivot;
456     for (i = 0; i < n; i++) {
457       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
458       d  = (aa)[ddiag[i]];
459       rs = -PetscAbsScalar(d) - PetscRealPart(d);
460       v  = aa + ai[i];
461       nz = ai[i + 1] - ai[i];
462       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
463       if (rs > sctx.shift_top) sctx.shift_top = rs;
464     }
465     sctx.shift_top *= 1.1;
466     sctx.nshift_max = 5;
467     sctx.shift_lo   = 0.;
468     sctx.shift_hi   = 1.;
469   }
470 
471   PetscCall(ISGetIndices(isrow, &r));
472   PetscCall(ISGetIndices(isicol, &ic));
473   PetscCall(PetscMalloc1(n + 1, &rtmp));
474   ics = ic;
475 
476   do {
477     sctx.newshift = PETSC_FALSE;
478     for (i = 0; i < n; i++) {
479       /* zero rtmp */
480       /* L part */
481       nz    = bi[i + 1] - bi[i];
482       bjtmp = bj + bi[i];
483       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
484 
485       /* U part */
486       nz    = bdiag[i] - bdiag[i + 1];
487       bjtmp = bj + bdiag[i + 1] + 1;
488       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
489 
490       /* load in initial (unfactored row) */
491       nz    = ai[r[i] + 1] - ai[r[i]];
492       ajtmp = aj + ai[r[i]];
493       v     = aa + ai[r[i]];
494       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
495 
496       /* ZeropivotApply() */
497       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
498 
499       /* elimination */
500       bjtmp = bj + bi[i];
501       row   = *bjtmp++;
502       nzL   = bi[i + 1] - bi[i];
503       for (k = 0; k < nzL; k++) {
504         pc = rtmp + row;
505         if (*pc != (PetscScalar)0.0) {
506           pv         = b->a + bdiag[row];
507           multiplier = *pc * (*pv);
508           *pc        = multiplier;
509 
510           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
511           pv = b->a + bdiag[row + 1] + 1;
512           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
513           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
514           PetscCall(PetscLogFlops(2.0 * nz));
515         }
516         row = *bjtmp++;
517       }
518 
519       /* finished row so stick it into b->a */
520       rs = 0.0;
521       /* L part */
522       pv = b->a + bi[i];
523       pj = b->j + bi[i];
524       nz = bi[i + 1] - bi[i];
525       for (j = 0; j < nz; j++) {
526         pv[j] = rtmp[pj[j]];
527         rs += PetscAbsScalar(pv[j]);
528       }
529 
530       /* U part */
531       pv = b->a + bdiag[i + 1] + 1;
532       pj = b->j + bdiag[i + 1] + 1;
533       nz = bdiag[i] - bdiag[i + 1] - 1;
534       for (j = 0; j < nz; j++) {
535         pv[j] = rtmp[pj[j]];
536         rs += PetscAbsScalar(pv[j]);
537       }
538 
539       sctx.rs = rs;
540       sctx.pv = rtmp[i];
541       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
542       if (sctx.newshift) break; /* break for-loop */
543       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
544 
545       /* Mark diagonal and invert diagonal for simpler triangular solves */
546       pv  = b->a + bdiag[i];
547       *pv = (PetscScalar)1.0 / rtmp[i];
548 
549     } /* endof for (i=0; i<n; i++) { */
550 
551     /* MatPivotRefine() */
552     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
553       /*
554        * if no shift in this attempt & shifting & started shifting & can refine,
555        * then try lower shift
556        */
557       sctx.shift_hi       = sctx.shift_fraction;
558       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
559       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
560       sctx.newshift       = PETSC_TRUE;
561       sctx.nshift++;
562     }
563   } while (sctx.newshift);
564 
565   PetscCall(PetscFree(rtmp));
566   PetscCall(ISRestoreIndices(isicol, &ic));
567   PetscCall(ISRestoreIndices(isrow, &r));
568 
569   PetscCall(ISIdentity(isrow, &row_identity));
570   PetscCall(ISIdentity(isicol, &col_identity));
571   if (row_identity && col_identity) {
572     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
573     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
574     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
575     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
576   } else {
577     C->ops->solve          = MatSolve_SeqBAIJ_1;
578     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
579   }
580   C->assembled = PETSC_TRUE;
581   PetscCall(PetscLogFlops(C->cmap->n));
582 
583   /* MatShiftView(A,info,&sctx) */
584   if (sctx.nshift) {
585     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
586       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
587     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
588       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
589     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
590       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
591     }
592   }
593   PetscFunctionReturn(0);
594 }
595 
596 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info) {
597   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
598   IS              isrow = b->row, isicol = b->icol;
599   const PetscInt *r, *ic;
600   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
601   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
602   PetscInt       *diag_offset = b->diag, diag, *pj;
603   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
604   MatScalar      *ba = b->a, *aa = a->a;
605   PetscBool       row_identity, col_identity;
606 
607   PetscFunctionBegin;
608   PetscCall(ISGetIndices(isrow, &r));
609   PetscCall(ISGetIndices(isicol, &ic));
610   PetscCall(PetscMalloc1(n + 1, &rtmp));
611 
612   for (i = 0; i < n; i++) {
613     nz    = bi[i + 1] - bi[i];
614     ajtmp = bj + bi[i];
615     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
616 
617     /* load in initial (unfactored row) */
618     nz       = ai[r[i] + 1] - ai[r[i]];
619     ajtmpold = aj + ai[r[i]];
620     v        = aa + ai[r[i]];
621     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
622 
623     row = *ajtmp++;
624     while (row < i) {
625       pc = rtmp + row;
626       if (*pc != 0.0) {
627         pv         = ba + diag_offset[row];
628         pj         = bj + diag_offset[row] + 1;
629         multiplier = *pc * *pv++;
630         *pc        = multiplier;
631         nz         = bi[row + 1] - diag_offset[row] - 1;
632         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
633         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
634       }
635       row = *ajtmp++;
636     }
637     /* finished row so stick it into b->a */
638     pv = ba + bi[i];
639     pj = bj + bi[i];
640     nz = bi[i + 1] - bi[i];
641     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
642     diag = diag_offset[i] - bi[i];
643     /* check pivot entry for current row */
644     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
645     pv[diag] = 1.0 / pv[diag];
646   }
647 
648   PetscCall(PetscFree(rtmp));
649   PetscCall(ISRestoreIndices(isicol, &ic));
650   PetscCall(ISRestoreIndices(isrow, &r));
651   PetscCall(ISIdentity(isrow, &row_identity));
652   PetscCall(ISIdentity(isicol, &col_identity));
653   if (row_identity && col_identity) {
654     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
655     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
656   } else {
657     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
658     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
659   }
660   C->assembled = PETSC_TRUE;
661   PetscCall(PetscLogFlops(C->cmap->n));
662   PetscFunctionReturn(0);
663 }
664 
665 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type) {
666   PetscFunctionBegin;
667   *type = MATSOLVERPETSC;
668   PetscFunctionReturn(0);
669 }
670 
671 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B) {
672   PetscInt n = A->rmap->n;
673 
674   PetscFunctionBegin;
675 #if defined(PETSC_USE_COMPLEX)
676   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
677 #endif
678   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
679   PetscCall(MatSetSizes(*B, n, n, n, n));
680   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
681     PetscCall(MatSetType(*B, MATSEQBAIJ));
682 
683     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
684     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
685     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
686     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
687     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
688   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
689     PetscCall(MatSetType(*B, MATSEQSBAIJ));
690     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
691 
692     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
693     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
694     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
695     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
696     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
697   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
698   (*B)->factortype     = ftype;
699   (*B)->canuseordering = PETSC_TRUE;
700 
701   PetscCall(PetscFree((*B)->solvertype));
702   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
703   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
704   PetscFunctionReturn(0);
705 }
706 
707 /* ----------------------------------------------------------- */
708 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info) {
709   Mat C;
710 
711   PetscFunctionBegin;
712   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
713   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
714   PetscCall(MatLUFactorNumeric(C, A, info));
715 
716   A->ops->solve          = C->ops->solve;
717   A->ops->solvetranspose = C->ops->solvetranspose;
718 
719   PetscCall(MatHeaderMerge(A, &C));
720   PetscFunctionReturn(0);
721 }
722 
723 #include <../src/mat/impls/sbaij/seq/sbaij.h>
724 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info) {
725   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
726   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
727   IS              ip = b->row;
728   const PetscInt *rip;
729   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
730   PetscInt       *ai = a->i, *aj = a->j;
731   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
732   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
733   PetscReal       rs;
734   FactorShiftCtx  sctx;
735 
736   PetscFunctionBegin;
737   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
738     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
739     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
740     PetscCall(MatDestroy(&a->sbaijMat));
741     PetscFunctionReturn(0);
742   }
743 
744   /* MatPivotSetUp(): initialize shift context sctx */
745   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
746 
747   PetscCall(ISGetIndices(ip, &rip));
748   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
749 
750   sctx.shift_amount = 0.;
751   sctx.nshift       = 0;
752   do {
753     sctx.newshift = PETSC_FALSE;
754     for (i = 0; i < mbs; i++) {
755       rtmp[i] = 0.0;
756       jl[i]   = mbs;
757       il[0]   = 0;
758     }
759 
760     for (k = 0; k < mbs; k++) {
761       bval = ba + bi[k];
762       /* initialize k-th row by the perm[k]-th row of A */
763       jmin = ai[rip[k]];
764       jmax = ai[rip[k] + 1];
765       for (j = jmin; j < jmax; j++) {
766         col = rip[aj[j]];
767         if (col >= k) { /* only take upper triangular entry */
768           rtmp[col] = aa[j];
769           *bval++   = 0.0; /* for in-place factorization */
770         }
771       }
772 
773       /* shift the diagonal of the matrix */
774       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
775 
776       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
777       dk = rtmp[k];
778       i  = jl[k]; /* first row to be added to k_th row  */
779 
780       while (i < k) {
781         nexti = jl[i]; /* next row to be added to k_th row */
782 
783         /* compute multiplier, update diag(k) and U(i,k) */
784         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
785         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
786         dk += uikdi * ba[ili];
787         ba[ili] = uikdi; /* -U(i,k) */
788 
789         /* add multiple of row i to k-th row */
790         jmin = ili + 1;
791         jmax = bi[i + 1];
792         if (jmin < jmax) {
793           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
794           /* update il and jl for row i */
795           il[i] = jmin;
796           j     = bj[jmin];
797           jl[i] = jl[j];
798           jl[j] = i;
799         }
800         i = nexti;
801       }
802 
803       /* shift the diagonals when zero pivot is detected */
804       /* compute rs=sum of abs(off-diagonal) */
805       rs   = 0.0;
806       jmin = bi[k] + 1;
807       nz   = bi[k + 1] - jmin;
808       if (nz) {
809         bcol = bj + jmin;
810         while (nz--) {
811           rs += PetscAbsScalar(rtmp[*bcol]);
812           bcol++;
813         }
814       }
815 
816       sctx.rs = rs;
817       sctx.pv = dk;
818       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
819       if (sctx.newshift) break;
820       dk = sctx.pv;
821 
822       /* copy data into U(k,:) */
823       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
824       jmin      = bi[k] + 1;
825       jmax      = bi[k + 1];
826       if (jmin < jmax) {
827         for (j = jmin; j < jmax; j++) {
828           col       = bj[j];
829           ba[j]     = rtmp[col];
830           rtmp[col] = 0.0;
831         }
832         /* add the k-th row into il and jl */
833         il[k] = jmin;
834         i     = bj[jmin];
835         jl[k] = jl[i];
836         jl[i] = k;
837       }
838     }
839   } while (sctx.newshift);
840   PetscCall(PetscFree3(rtmp, il, jl));
841 
842   PetscCall(ISRestoreIndices(ip, &rip));
843 
844   C->assembled    = PETSC_TRUE;
845   C->preallocated = PETSC_TRUE;
846 
847   PetscCall(PetscLogFlops(C->rmap->N));
848   if (sctx.nshift) {
849     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
850       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
851     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
852       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
853     }
854   }
855   PetscFunctionReturn(0);
856 }
857 
858 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info) {
859   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
860   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
861   PetscInt       i, j, am = a->mbs;
862   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
863   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
864   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
865   PetscReal      rs;
866   FactorShiftCtx sctx;
867 
868   PetscFunctionBegin;
869   /* MatPivotSetUp(): initialize shift context sctx */
870   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
871 
872   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
873 
874   do {
875     sctx.newshift = PETSC_FALSE;
876     for (i = 0; i < am; i++) {
877       rtmp[i] = 0.0;
878       jl[i]   = am;
879       il[0]   = 0;
880     }
881 
882     for (k = 0; k < am; k++) {
883       /* initialize k-th row with elements nonzero in row perm(k) of A */
884       nz   = ai[k + 1] - ai[k];
885       acol = aj + ai[k];
886       aval = aa + ai[k];
887       bval = ba + bi[k];
888       while (nz--) {
889         if (*acol < k) { /* skip lower triangular entries */
890           acol++;
891           aval++;
892         } else {
893           rtmp[*acol++] = *aval++;
894           *bval++       = 0.0; /* for in-place factorization */
895         }
896       }
897 
898       /* shift the diagonal of the matrix */
899       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
900 
901       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
902       dk = rtmp[k];
903       i  = jl[k]; /* first row to be added to k_th row  */
904 
905       while (i < k) {
906         nexti = jl[i]; /* next row to be added to k_th row */
907         /* compute multiplier, update D(k) and U(i,k) */
908         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
909         uikdi = -ba[ili] * ba[bi[i]];
910         dk += uikdi * ba[ili];
911         ba[ili] = uikdi; /* -U(i,k) */
912 
913         /* add multiple of row i to k-th row ... */
914         jmin = ili + 1;
915         nz   = bi[i + 1] - jmin;
916         if (nz > 0) {
917           bcol = bj + jmin;
918           bval = ba + jmin;
919           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
920           /* update il and jl for i-th row */
921           il[i] = jmin;
922           j     = bj[jmin];
923           jl[i] = jl[j];
924           jl[j] = i;
925         }
926         i = nexti;
927       }
928 
929       /* shift the diagonals when zero pivot is detected */
930       /* compute rs=sum of abs(off-diagonal) */
931       rs   = 0.0;
932       jmin = bi[k] + 1;
933       nz   = bi[k + 1] - jmin;
934       if (nz) {
935         bcol = bj + jmin;
936         while (nz--) {
937           rs += PetscAbsScalar(rtmp[*bcol]);
938           bcol++;
939         }
940       }
941 
942       sctx.rs = rs;
943       sctx.pv = dk;
944       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
945       if (sctx.newshift) break; /* sctx.shift_amount is updated */
946       dk = sctx.pv;
947 
948       /* copy data into U(k,:) */
949       ba[bi[k]] = 1.0 / dk;
950       jmin      = bi[k] + 1;
951       nz        = bi[k + 1] - jmin;
952       if (nz) {
953         bcol = bj + jmin;
954         bval = ba + jmin;
955         while (nz--) {
956           *bval++       = rtmp[*bcol];
957           rtmp[*bcol++] = 0.0;
958         }
959         /* add k-th row into il and jl */
960         il[k] = jmin;
961         i     = bj[jmin];
962         jl[k] = jl[i];
963         jl[i] = k;
964       }
965     }
966   } while (sctx.newshift);
967   PetscCall(PetscFree3(rtmp, il, jl));
968 
969   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
970   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
971   C->assembled           = PETSC_TRUE;
972   C->preallocated        = PETSC_TRUE;
973 
974   PetscCall(PetscLogFlops(C->rmap->N));
975   if (sctx.nshift) {
976     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
977       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
978     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
979       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
980     }
981   }
982   PetscFunctionReturn(0);
983 }
984 
985 #include <petscbt.h>
986 #include <../src/mat/utils/freespace.h>
987 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
988   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
989   Mat_SeqSBAIJ      *b;
990   Mat                B;
991   PetscBool          perm_identity, missing;
992   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
993   const PetscInt    *rip;
994   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
995   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
996   PetscReal          fill = info->fill, levels = info->levels;
997   PetscFreeSpaceList free_space = NULL, current_space = NULL;
998   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
999   PetscBT            lnkbt;
1000 
1001   PetscFunctionBegin;
1002   PetscCall(MatMissingDiagonal(A, &missing, &i));
1003   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1004 
1005   if (bs > 1) {
1006     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1007     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1008 
1009     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1010     PetscFunctionReturn(0);
1011   }
1012 
1013   PetscCall(ISIdentity(perm, &perm_identity));
1014   PetscCall(ISGetIndices(perm, &rip));
1015 
1016   /* special case that simply copies fill pattern */
1017   if (!levels && perm_identity) {
1018     PetscCall(PetscMalloc1(am + 1, &ui));
1019     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1020     B = fact;
1021     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1022 
1023     b  = (Mat_SeqSBAIJ *)B->data;
1024     uj = b->j;
1025     for (i = 0; i < am; i++) {
1026       aj = a->j + a->diag[i];
1027       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1028       b->ilen[i] = ui[i];
1029     }
1030     PetscCall(PetscFree(ui));
1031 
1032     B->factortype = MAT_FACTOR_NONE;
1033 
1034     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1035     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1036     B->factortype = MAT_FACTOR_ICC;
1037 
1038     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1039     PetscFunctionReturn(0);
1040   }
1041 
1042   /* initialization */
1043   PetscCall(PetscMalloc1(am + 1, &ui));
1044   ui[0] = 0;
1045   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1046 
1047   /* jl: linked list for storing indices of the pivot rows
1048      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1049   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1050   for (i = 0; i < am; i++) {
1051     jl[i] = am;
1052     il[i] = 0;
1053   }
1054 
1055   /* create and initialize a linked list for storing column indices of the active row k */
1056   nlnk = am + 1;
1057   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1058 
1059   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1060   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1061 
1062   current_space = free_space;
1063 
1064   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1065   current_space_lvl = free_space_lvl;
1066 
1067   for (k = 0; k < am; k++) { /* for each active row k */
1068     /* initialize lnk by the column indices of row rip[k] of A */
1069     nzk         = 0;
1070     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1071     ncols_upper = 0;
1072     cols        = cols_lvl + am;
1073     for (j = 0; j < ncols; j++) {
1074       i = rip[*(aj + ai[rip[k]] + j)];
1075       if (i >= k) { /* only take upper triangular entry */
1076         cols[ncols_upper]     = i;
1077         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1078         ncols_upper++;
1079       }
1080     }
1081     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1082     nzk += nlnk;
1083 
1084     /* update lnk by computing fill-in for each pivot row to be merged in */
1085     prow = jl[k]; /* 1st pivot row */
1086 
1087     while (prow < k) {
1088       nextprow = jl[prow];
1089 
1090       /* merge prow into k-th row */
1091       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1092       jmax  = ui[prow + 1];
1093       ncols = jmax - jmin;
1094       i     = jmin - ui[prow];
1095       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1096       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1097       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1098       nzk += nlnk;
1099 
1100       /* update il and jl for prow */
1101       if (jmin < jmax) {
1102         il[prow] = jmin;
1103 
1104         j        = *cols;
1105         jl[prow] = jl[j];
1106         jl[j]    = prow;
1107       }
1108       prow = nextprow;
1109     }
1110 
1111     /* if free space is not available, make more free space */
1112     if (current_space->local_remaining < nzk) {
1113       i = am - k + 1;                                                             /* num of unfactored rows */
1114       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1115       PetscCall(PetscFreeSpaceGet(i, &current_space));
1116       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1117       reallocs++;
1118     }
1119 
1120     /* copy data into free_space and free_space_lvl, then initialize lnk */
1121     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1122 
1123     /* add the k-th row into il and jl */
1124     if (nzk - 1 > 0) {
1125       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1126       jl[k] = jl[i];
1127       jl[i] = k;
1128       il[k] = ui[k] + 1;
1129     }
1130     uj_ptr[k]     = current_space->array;
1131     uj_lvl_ptr[k] = current_space_lvl->array;
1132 
1133     current_space->array += nzk;
1134     current_space->local_used += nzk;
1135     current_space->local_remaining -= nzk;
1136 
1137     current_space_lvl->array += nzk;
1138     current_space_lvl->local_used += nzk;
1139     current_space_lvl->local_remaining -= nzk;
1140 
1141     ui[k + 1] = ui[k] + nzk;
1142   }
1143 
1144   PetscCall(ISRestoreIndices(perm, &rip));
1145   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1146   PetscCall(PetscFree(cols_lvl));
1147 
1148   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1149   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1150   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1151   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1152   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1153 
1154   /* put together the new matrix in MATSEQSBAIJ format */
1155   B = fact;
1156   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1157 
1158   b               = (Mat_SeqSBAIJ *)B->data;
1159   b->singlemalloc = PETSC_FALSE;
1160   b->free_a       = PETSC_TRUE;
1161   b->free_ij      = PETSC_TRUE;
1162 
1163   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1164 
1165   b->j             = uj;
1166   b->i             = ui;
1167   b->diag          = NULL;
1168   b->ilen          = NULL;
1169   b->imax          = NULL;
1170   b->row           = perm;
1171   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1172 
1173   PetscCall(PetscObjectReference((PetscObject)perm));
1174 
1175   b->icol = perm;
1176 
1177   PetscCall(PetscObjectReference((PetscObject)perm));
1178   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1179 
1180   b->maxnz = b->nz = ui[am];
1181 
1182   B->info.factor_mallocs   = reallocs;
1183   B->info.fill_ratio_given = fill;
1184   if (ai[am] != 0.) {
1185     /* nonzeros in lower triangular part of A (includign diagonals)= (ai[am]+am)/2 */
1186     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1187   } else {
1188     B->info.fill_ratio_needed = 0.0;
1189   }
1190 #if defined(PETSC_USE_INFO)
1191   if (ai[am] != 0) {
1192     PetscReal af = B->info.fill_ratio_needed;
1193     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1194     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1195     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1196   } else {
1197     PetscCall(PetscInfo(A, "Empty matrix\n"));
1198   }
1199 #endif
1200   if (perm_identity) {
1201     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1202     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1203     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1204   } else {
1205     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1206   }
1207   PetscFunctionReturn(0);
1208 }
1209 
1210 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
1211   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1212   Mat_SeqSBAIJ      *b;
1213   Mat                B;
1214   PetscBool          perm_identity, missing;
1215   PetscReal          fill = info->fill;
1216   const PetscInt    *rip;
1217   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1218   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1219   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1220   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1221   PetscBT            lnkbt;
1222 
1223   PetscFunctionBegin;
1224   if (bs > 1) { /* convert to seqsbaij */
1225     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1226     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1227 
1228     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1229     PetscFunctionReturn(0);
1230   }
1231 
1232   PetscCall(MatMissingDiagonal(A, &missing, &i));
1233   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1234 
1235   /* check whether perm is the identity mapping */
1236   PetscCall(ISIdentity(perm, &perm_identity));
1237   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1238   PetscCall(ISGetIndices(perm, &rip));
1239 
1240   /* initialization */
1241   PetscCall(PetscMalloc1(mbs + 1, &ui));
1242   ui[0] = 0;
1243 
1244   /* jl: linked list for storing indices of the pivot rows
1245      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1246   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1247   for (i = 0; i < mbs; i++) {
1248     jl[i] = mbs;
1249     il[i] = 0;
1250   }
1251 
1252   /* create and initialize a linked list for storing column indices of the active row k */
1253   nlnk = mbs + 1;
1254   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1255 
1256   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1257   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1258 
1259   current_space = free_space;
1260 
1261   for (k = 0; k < mbs; k++) { /* for each active row k */
1262     /* initialize lnk by the column indices of row rip[k] of A */
1263     nzk         = 0;
1264     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1265     ncols_upper = 0;
1266     for (j = 0; j < ncols; j++) {
1267       i = rip[*(aj + ai[rip[k]] + j)];
1268       if (i >= k) { /* only take upper triangular entry */
1269         cols[ncols_upper] = i;
1270         ncols_upper++;
1271       }
1272     }
1273     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1274     nzk += nlnk;
1275 
1276     /* update lnk by computing fill-in for each pivot row to be merged in */
1277     prow = jl[k]; /* 1st pivot row */
1278 
1279     while (prow < k) {
1280       nextprow = jl[prow];
1281       /* merge prow into k-th row */
1282       jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1283       jmax     = ui[prow + 1];
1284       ncols    = jmax - jmin;
1285       uj_ptr   = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1286       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1287       nzk += nlnk;
1288 
1289       /* update il and jl for prow */
1290       if (jmin < jmax) {
1291         il[prow] = jmin;
1292         j        = *uj_ptr;
1293         jl[prow] = jl[j];
1294         jl[j]    = prow;
1295       }
1296       prow = nextprow;
1297     }
1298 
1299     /* if free space is not available, make more free space */
1300     if (current_space->local_remaining < nzk) {
1301       i = mbs - k + 1;                                                            /* num of unfactored rows */
1302       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1303       PetscCall(PetscFreeSpaceGet(i, &current_space));
1304       reallocs++;
1305     }
1306 
1307     /* copy data into free space, then initialize lnk */
1308     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1309 
1310     /* add the k-th row into il and jl */
1311     if (nzk - 1 > 0) {
1312       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1313       jl[k] = jl[i];
1314       jl[i] = k;
1315       il[k] = ui[k] + 1;
1316     }
1317     ui_ptr[k] = current_space->array;
1318     current_space->array += nzk;
1319     current_space->local_used += nzk;
1320     current_space->local_remaining -= nzk;
1321 
1322     ui[k + 1] = ui[k] + nzk;
1323   }
1324 
1325   PetscCall(ISRestoreIndices(perm, &rip));
1326   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1327 
1328   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1329   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1330   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1331   PetscCall(PetscLLDestroy(lnk, lnkbt));
1332 
1333   /* put together the new matrix in MATSEQSBAIJ format */
1334   B = fact;
1335   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1336 
1337   b               = (Mat_SeqSBAIJ *)B->data;
1338   b->singlemalloc = PETSC_FALSE;
1339   b->free_a       = PETSC_TRUE;
1340   b->free_ij      = PETSC_TRUE;
1341 
1342   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1343 
1344   b->j             = uj;
1345   b->i             = ui;
1346   b->diag          = NULL;
1347   b->ilen          = NULL;
1348   b->imax          = NULL;
1349   b->row           = perm;
1350   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1351 
1352   PetscCall(PetscObjectReference((PetscObject)perm));
1353   b->icol = perm;
1354   PetscCall(PetscObjectReference((PetscObject)perm));
1355   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1356   b->maxnz = b->nz = ui[mbs];
1357 
1358   B->info.factor_mallocs   = reallocs;
1359   B->info.fill_ratio_given = fill;
1360   if (ai[mbs] != 0.) {
1361     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1362     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1363   } else {
1364     B->info.fill_ratio_needed = 0.0;
1365   }
1366 #if defined(PETSC_USE_INFO)
1367   if (ai[mbs] != 0.) {
1368     PetscReal af = B->info.fill_ratio_needed;
1369     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1370     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1371     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1372   } else {
1373     PetscCall(PetscInfo(A, "Empty matrix\n"));
1374   }
1375 #endif
1376   if (perm_identity) {
1377     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1378   } else {
1379     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1380   }
1381   PetscFunctionReturn(0);
1382 }
1383 
1384 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1385   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1386   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1387   PetscInt           i, k, n                       = a->mbs;
1388   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1389   const MatScalar   *aa = a->a, *v;
1390   PetscScalar       *x, *s, *t, *ls;
1391   const PetscScalar *b;
1392 
1393   PetscFunctionBegin;
1394   PetscCall(VecGetArrayRead(bb, &b));
1395   PetscCall(VecGetArray(xx, &x));
1396   t = a->solve_work;
1397 
1398   /* forward solve the lower triangular */
1399   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1400 
1401   for (i = 1; i < n; i++) {
1402     v  = aa + bs2 * ai[i];
1403     vi = aj + ai[i];
1404     nz = ai[i + 1] - ai[i];
1405     s  = t + bs * i;
1406     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1407     for (k = 0; k < nz; k++) {
1408       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1409       v += bs2;
1410     }
1411   }
1412 
1413   /* backward solve the upper triangular */
1414   ls = a->solve_work + A->cmap->n;
1415   for (i = n - 1; i >= 0; i--) {
1416     v  = aa + bs2 * (adiag[i + 1] + 1);
1417     vi = aj + adiag[i + 1] + 1;
1418     nz = adiag[i] - adiag[i + 1] - 1;
1419     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1420     for (k = 0; k < nz; k++) {
1421       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1422       v += bs2;
1423     }
1424     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1425     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1426   }
1427 
1428   PetscCall(VecRestoreArrayRead(bb, &b));
1429   PetscCall(VecRestoreArray(xx, &x));
1430   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1431   PetscFunctionReturn(0);
1432 }
1433 
1434 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx) {
1435   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1436   IS                 iscol = a->col, isrow = a->row;
1437   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1438   PetscInt           i, m, n = a->mbs;
1439   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1440   const MatScalar   *aa = a->a, *v;
1441   PetscScalar       *x, *s, *t, *ls;
1442   const PetscScalar *b;
1443 
1444   PetscFunctionBegin;
1445   PetscCall(VecGetArrayRead(bb, &b));
1446   PetscCall(VecGetArray(xx, &x));
1447   t = a->solve_work;
1448 
1449   PetscCall(ISGetIndices(isrow, &rout));
1450   r = rout;
1451   PetscCall(ISGetIndices(iscol, &cout));
1452   c = cout;
1453 
1454   /* forward solve the lower triangular */
1455   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1456   for (i = 1; i < n; i++) {
1457     v  = aa + bs2 * ai[i];
1458     vi = aj + ai[i];
1459     nz = ai[i + 1] - ai[i];
1460     s  = t + bs * i;
1461     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1462     for (m = 0; m < nz; m++) {
1463       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1464       v += bs2;
1465     }
1466   }
1467 
1468   /* backward solve the upper triangular */
1469   ls = a->solve_work + A->cmap->n;
1470   for (i = n - 1; i >= 0; i--) {
1471     v  = aa + bs2 * (adiag[i + 1] + 1);
1472     vi = aj + adiag[i + 1] + 1;
1473     nz = adiag[i] - adiag[i + 1] - 1;
1474     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1475     for (m = 0; m < nz; m++) {
1476       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1477       v += bs2;
1478     }
1479     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1480     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1481   }
1482   PetscCall(ISRestoreIndices(isrow, &rout));
1483   PetscCall(ISRestoreIndices(iscol, &cout));
1484   PetscCall(VecRestoreArrayRead(bb, &b));
1485   PetscCall(VecRestoreArray(xx, &x));
1486   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1487   PetscFunctionReturn(0);
1488 }
1489 
1490 /*
1491     For each block in an block array saves the largest absolute value in the block into another array
1492 */
1493 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray) {
1494   PetscInt i, j;
1495 
1496   PetscFunctionBegin;
1497   PetscCall(PetscArrayzero(absarray, nbs + 1));
1498   for (i = 0; i < nbs; i++) {
1499     for (j = 0; j < bs2; j++) {
1500       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1501     }
1502   }
1503   PetscFunctionReturn(0);
1504 }
1505 
1506 /*
1507      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1508 */
1509 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact) {
1510   Mat             B = *fact;
1511   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1512   IS              isicol;
1513   const PetscInt *r, *ic;
1514   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1515   PetscInt       *bi, *bj, *bdiag;
1516 
1517   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1518   PetscInt   nlnk, *lnk;
1519   PetscBT    lnkbt;
1520   PetscBool  row_identity, icol_identity;
1521   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1522   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1523 
1524   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1525   PetscInt   nnz_max;
1526   PetscBool  missing;
1527   PetscReal *vtmp_abs;
1528   MatScalar *v_work;
1529   PetscInt  *v_pivots;
1530   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1531 
1532   PetscFunctionBegin;
1533   /* ------- symbolic factorization, can be reused ---------*/
1534   PetscCall(MatMissingDiagonal(A, &missing, &i));
1535   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1536   adiag = a->diag;
1537 
1538   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1539 
1540   /* bdiag is location of diagonal in factor */
1541   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1542 
1543   /* allocate row pointers bi */
1544   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1545 
1546   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1547   dtcount = (PetscInt)info->dtcount;
1548   if (dtcount > mbs - 1) dtcount = mbs - 1;
1549   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1550   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1551   PetscCall(PetscMalloc1(nnz_max, &bj));
1552   nnz_max = nnz_max * bs2;
1553   PetscCall(PetscMalloc1(nnz_max, &ba));
1554 
1555   /* put together the new matrix */
1556   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1557 
1558   b               = (Mat_SeqBAIJ *)(B)->data;
1559   b->free_a       = PETSC_TRUE;
1560   b->free_ij      = PETSC_TRUE;
1561   b->singlemalloc = PETSC_FALSE;
1562 
1563   b->a    = ba;
1564   b->j    = bj;
1565   b->i    = bi;
1566   b->diag = bdiag;
1567   b->ilen = NULL;
1568   b->imax = NULL;
1569   b->row  = isrow;
1570   b->col  = iscol;
1571 
1572   PetscCall(PetscObjectReference((PetscObject)isrow));
1573   PetscCall(PetscObjectReference((PetscObject)iscol));
1574 
1575   b->icol = isicol;
1576   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1577   b->maxnz = nnz_max / bs2;
1578 
1579   (B)->factortype            = MAT_FACTOR_ILUDT;
1580   (B)->info.factor_mallocs   = 0;
1581   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1582   /* ------- end of symbolic factorization ---------*/
1583   PetscCall(ISGetIndices(isrow, &r));
1584   PetscCall(ISGetIndices(isicol, &ic));
1585 
1586   /* linked list for storing column indices of the active row */
1587   nlnk = mbs + 1;
1588   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1589 
1590   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1591   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1592   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1593   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1594   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1595   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1596 
1597   allowzeropivot  = PetscNot(A->erroriffailure);
1598   bi[0]           = 0;
1599   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1600   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1601   for (i = 0; i < mbs; i++) {
1602     /* copy initial fill into linked list */
1603     nzi = ai[r[i] + 1] - ai[r[i]];
1604     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1605     nzi_al = adiag[r[i]] - ai[r[i]];
1606     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1607 
1608     /* load in initial unfactored row */
1609     ajtmp = aj + ai[r[i]];
1610     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1611     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1612     aatmp = a->a + bs2 * ai[r[i]];
1613     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1614 
1615     /* add pivot rows into linked list */
1616     row = lnk[mbs];
1617     while (row < i) {
1618       nzi_bl = bi[row + 1] - bi[row] + 1;
1619       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1620       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1621       nzi += nlnk;
1622       row = lnk[row];
1623     }
1624 
1625     /* copy data from lnk into jtmp, then initialize lnk */
1626     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1627 
1628     /* numerical factorization */
1629     bjtmp = jtmp;
1630     row   = *bjtmp++; /* 1st pivot row */
1631 
1632     while (row < i) {
1633       pc = rtmp + bs2 * row;
1634       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1635       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1636       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1637       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1638         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1639         pv = ba + bs2 * (bdiag[row + 1] + 1);
1640         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1641         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1642         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1643       }
1644       row = *bjtmp++;
1645     }
1646 
1647     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1648     nzi_bl = 0;
1649     j      = 0;
1650     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1651       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1652       nzi_bl++;
1653       j++;
1654     }
1655     nzi_bu = nzi - nzi_bl - 1;
1656 
1657     while (j < nzi) { /* U-part */
1658       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1659       j++;
1660     }
1661 
1662     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1663 
1664     bjtmp = bj + bi[i];
1665     batmp = ba + bs2 * bi[i];
1666     /* apply level dropping rule to L part */
1667     ncut  = nzi_al + dtcount;
1668     if (ncut < nzi_bl) {
1669       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1670       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1671     } else {
1672       ncut = nzi_bl;
1673     }
1674     for (j = 0; j < ncut; j++) {
1675       bjtmp[j] = jtmp[j];
1676       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1677     }
1678     bi[i + 1] = bi[i] + ncut;
1679     nzi       = ncut + 1;
1680 
1681     /* apply level dropping rule to U part */
1682     ncut = nzi_au + dtcount;
1683     if (ncut < nzi_bu) {
1684       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1685       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1686     } else {
1687       ncut = nzi_bu;
1688     }
1689     nzi += ncut;
1690 
1691     /* mark bdiagonal */
1692     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1693     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1694 
1695     bjtmp = bj + bdiag[i];
1696     batmp = ba + bs2 * bdiag[i];
1697     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1698     *bjtmp = i;
1699 
1700     bjtmp = bj + bdiag[i + 1] + 1;
1701     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1702 
1703     for (k = 0; k < ncut; k++) {
1704       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1705       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1706     }
1707 
1708     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1709 
1710     /* invert diagonal block for simpler triangular solves - add shift??? */
1711     batmp = ba + bs2 * bdiag[i];
1712 
1713     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1714     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1715   } /* for (i=0; i<mbs; i++) */
1716   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1717 
1718   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1719   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1720 
1721   PetscCall(ISRestoreIndices(isrow, &r));
1722   PetscCall(ISRestoreIndices(isicol, &ic));
1723 
1724   PetscCall(PetscLLDestroy(lnk, lnkbt));
1725 
1726   PetscCall(PetscFree2(im, jtmp));
1727   PetscCall(PetscFree2(rtmp, vtmp));
1728 
1729   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1730   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1731 
1732   PetscCall(ISIdentity(isrow, &row_identity));
1733   PetscCall(ISIdentity(isicol, &icol_identity));
1734   if (row_identity && icol_identity) {
1735     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1736   } else {
1737     B->ops->solve = MatSolve_SeqBAIJ_N;
1738   }
1739 
1740   B->ops->solveadd          = NULL;
1741   B->ops->solvetranspose    = NULL;
1742   B->ops->solvetransposeadd = NULL;
1743   B->ops->matsolve          = NULL;
1744   B->assembled              = PETSC_TRUE;
1745   B->preallocated           = PETSC_TRUE;
1746   PetscFunctionReturn(0);
1747 }
1748