xref: /petsc/src/mat/impls/baij/seq/baijfact.c (revision ae51ba108530e1a8d2aeb8bc3771a24c2fcd801e)
1 
2 /*
3     Factorization code for BAIJ format.
4 */
5 #include <../src/mat/impls/baij/seq/baij.h>
6 #include <petsc/private/kernels/blockinvert.h>
7 
8 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2(Mat B, Mat A, const MatFactorInfo *info)
9 {
10   Mat             C = B;
11   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
12   IS              isrow = b->row, isicol = b->icol;
13   const PetscInt *r, *ic;
14   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
15   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
16   MatScalar      *rtmp, *pc, *mwork, *pv;
17   MatScalar      *aa             = a->a, *v;
18   PetscReal       shift          = info->shiftamount;
19   const PetscBool allowzeropivot = PetscNot(A->erroriffailure);
20 
21   PetscFunctionBegin;
22   PetscCall(ISGetIndices(isrow, &r));
23   PetscCall(ISGetIndices(isicol, &ic));
24 
25   /* generate work space needed by the factorization */
26   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
27   PetscCall(PetscArrayzero(rtmp, bs2 * n));
28 
29   for (PetscInt i = 0; i < n; i++) {
30     /* zero rtmp */
31     /* L part */
32     PetscInt *pj;
33     PetscInt  nzL, nz = bi[i + 1] - bi[i];
34     bjtmp = bj + bi[i];
35     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
36 
37     /* U part */
38     nz    = bdiag[i] - bdiag[i + 1];
39     bjtmp = bj + bdiag[i + 1] + 1;
40     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
41 
42     /* load in initial (unfactored row) */
43     nz    = ai[r[i] + 1] - ai[r[i]];
44     ajtmp = aj + ai[r[i]];
45     v     = aa + bs2 * ai[r[i]];
46     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], v + bs2 * j, bs2));
47 
48     /* elimination */
49     bjtmp = bj + bi[i];
50     nzL   = bi[i + 1] - bi[i];
51     for (PetscInt k = 0; k < nzL; k++) {
52       PetscBool flg = PETSC_FALSE;
53       PetscInt  row = bjtmp[k];
54 
55       pc = rtmp + bs2 * row;
56       for (PetscInt j = 0; j < bs2; j++) {
57         if (pc[j] != (PetscScalar)0.0) {
58           flg = PETSC_TRUE;
59           break;
60         }
61       }
62       if (flg) {
63         pv = b->a + bs2 * bdiag[row];
64         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
65         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
66 
67         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
68         pv = b->a + bs2 * (bdiag[row + 1] + 1);
69         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries inU(row,:), excluding diag */
70         for (PetscInt j = 0; j < nz; j++) {
71           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
72           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
73           v = rtmp + 4 * pj[j];
74           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
75           pv += 4;
76         }
77         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
78       }
79     }
80 
81     /* finished row so stick it into b->a */
82     /* L part */
83     pv = b->a + bs2 * bi[i];
84     pj = b->j + bi[i];
85     nz = bi[i + 1] - bi[i];
86     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
87 
88     /* Mark diagonal and invert diagonal for simpler triangular solves */
89     pv = b->a + bs2 * bdiag[i];
90     pj = b->j + bdiag[i];
91     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
92     {
93       PetscBool zeropivotdetected;
94 
95       PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
96       if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
97     }
98 
99     /* U part */
100     pv = b->a + bs2 * (bdiag[i + 1] + 1);
101     pj = b->j + bdiag[i + 1] + 1;
102     nz = bdiag[i] - bdiag[i + 1] - 1;
103     for (PetscInt j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
104   }
105 
106   PetscCall(PetscFree2(rtmp, mwork));
107   PetscCall(ISRestoreIndices(isicol, &ic));
108   PetscCall(ISRestoreIndices(isrow, &r));
109 
110   C->ops->solve          = MatSolve_SeqBAIJ_2;
111   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2;
112   C->assembled           = PETSC_TRUE;
113 
114   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
115   PetscFunctionReturn(PETSC_SUCCESS);
116 }
117 
118 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering(Mat B, Mat A, const MatFactorInfo *info)
119 {
120   Mat             C = B;
121   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
122   PetscInt        i, j, k, nz, nzL, row, *pj;
123   const PetscInt  n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, bs2 = a->bs2;
124   const PetscInt *ajtmp, *bjtmp, *bdiag = b->diag;
125   MatScalar      *rtmp, *pc, *mwork, *pv;
126   MatScalar      *aa = a->a, *v;
127   PetscInt        flg;
128   PetscReal       shift = info->shiftamount;
129   PetscBool       allowzeropivot, zeropivotdetected;
130 
131   PetscFunctionBegin;
132   allowzeropivot = PetscNot(A->erroriffailure);
133 
134   /* generate work space needed by the factorization */
135   PetscCall(PetscMalloc2(bs2 * n, &rtmp, bs2, &mwork));
136   PetscCall(PetscArrayzero(rtmp, bs2 * n));
137 
138   for (i = 0; i < n; i++) {
139     /* zero rtmp */
140     /* L part */
141     nz    = bi[i + 1] - bi[i];
142     bjtmp = bj + bi[i];
143     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
144 
145     /* U part */
146     nz    = bdiag[i] - bdiag[i + 1];
147     bjtmp = bj + bdiag[i + 1] + 1;
148     for (j = 0; j < nz; j++) PetscCall(PetscArrayzero(rtmp + bs2 * bjtmp[j], bs2));
149 
150     /* load in initial (unfactored row) */
151     nz    = ai[i + 1] - ai[i];
152     ajtmp = aj + ai[i];
153     v     = aa + bs2 * ai[i];
154     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ajtmp[j], v + bs2 * j, bs2));
155 
156     /* elimination */
157     bjtmp = bj + bi[i];
158     nzL   = bi[i + 1] - bi[i];
159     for (k = 0; k < nzL; k++) {
160       row = bjtmp[k];
161       pc  = rtmp + bs2 * row;
162       for (flg = 0, j = 0; j < bs2; j++) {
163         if (pc[j] != (PetscScalar)0.0) {
164           flg = 1;
165           break;
166         }
167       }
168       if (flg) {
169         pv = b->a + bs2 * bdiag[row];
170         /* PetscKernel_A_gets_A_times_B(bs,pc,pv,mwork); *pc = *pc * (*pv); */
171         PetscCall(PetscKernel_A_gets_A_times_B_2(pc, pv, mwork));
172 
173         pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
174         pv = b->a + bs2 * (bdiag[row + 1] + 1);
175         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
176         for (j = 0; j < nz; j++) {
177           /* PetscKernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); */
178           /* rtmp+bs2*pj[j] = rtmp+bs2*pj[j] - (*pc)*(pv+bs2*j) */
179           v = rtmp + 4 * pj[j];
180           PetscCall(PetscKernel_A_gets_A_minus_B_times_C_2(v, pc, pv));
181           pv += 4;
182         }
183         PetscCall(PetscLogFlops(16.0 * nz + 12)); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
184       }
185     }
186 
187     /* finished row so stick it into b->a */
188     /* L part */
189     pv = b->a + bs2 * bi[i];
190     pj = b->j + bi[i];
191     nz = bi[i + 1] - bi[i];
192     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
193 
194     /* Mark diagonal and invert diagonal for simpler triangular solves */
195     pv = b->a + bs2 * bdiag[i];
196     pj = b->j + bdiag[i];
197     PetscCall(PetscArraycpy(pv, rtmp + bs2 * pj[0], bs2));
198     PetscCall(PetscKernel_A_gets_inverse_A_2(pv, shift, allowzeropivot, &zeropivotdetected));
199     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
200 
201     /* U part */
202     /*
203     pv = b->a + bs2*bi[2*n-i];
204     pj = b->j + bi[2*n-i];
205     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
206     */
207     pv = b->a + bs2 * (bdiag[i + 1] + 1);
208     pj = b->j + bdiag[i + 1] + 1;
209     nz = bdiag[i] - bdiag[i + 1] - 1;
210     for (j = 0; j < nz; j++) PetscCall(PetscArraycpy(pv + bs2 * j, rtmp + bs2 * pj[j], bs2));
211   }
212   PetscCall(PetscFree2(rtmp, mwork));
213 
214   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering;
215   C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_2_NaturalOrdering;
216   C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_2_NaturalOrdering;
217   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
218   C->assembled           = PETSC_TRUE;
219 
220   PetscCall(PetscLogFlops(1.333333333333 * 2 * 2 * 2 * n)); /* from inverting diagonal blocks */
221   PetscFunctionReturn(PETSC_SUCCESS);
222 }
223 
224 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_inplace(Mat B, Mat A, const MatFactorInfo *info)
225 {
226   Mat             C = B;
227   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
228   IS              isrow = b->row, isicol = b->icol;
229   const PetscInt *r, *ic;
230   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
231   PetscInt       *ajtmpold, *ajtmp, nz, row;
232   PetscInt       *diag_offset = b->diag, idx, *ai = a->i, *aj = a->j, *pj;
233   MatScalar      *pv, *v, *rtmp, m1, m2, m3, m4, *pc, *w, *x, x1, x2, x3, x4;
234   MatScalar       p1, p2, p3, p4;
235   MatScalar      *ba = b->a, *aa = a->a;
236   PetscReal       shift = info->shiftamount;
237   PetscBool       allowzeropivot, zeropivotdetected;
238 
239   PetscFunctionBegin;
240   allowzeropivot = PetscNot(A->erroriffailure);
241   PetscCall(ISGetIndices(isrow, &r));
242   PetscCall(ISGetIndices(isicol, &ic));
243   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
244 
245   for (i = 0; i < n; i++) {
246     nz    = bi[i + 1] - bi[i];
247     ajtmp = bj + bi[i];
248     for (j = 0; j < nz; j++) {
249       x    = rtmp + 4 * ajtmp[j];
250       x[0] = x[1] = x[2] = x[3] = 0.0;
251     }
252     /* load in initial (unfactored row) */
253     idx      = r[i];
254     nz       = ai[idx + 1] - ai[idx];
255     ajtmpold = aj + ai[idx];
256     v        = aa + 4 * ai[idx];
257     for (j = 0; j < nz; j++) {
258       x    = rtmp + 4 * ic[ajtmpold[j]];
259       x[0] = v[0];
260       x[1] = v[1];
261       x[2] = v[2];
262       x[3] = v[3];
263       v += 4;
264     }
265     row = *ajtmp++;
266     while (row < i) {
267       pc = rtmp + 4 * row;
268       p1 = pc[0];
269       p2 = pc[1];
270       p3 = pc[2];
271       p4 = pc[3];
272       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
273         pv    = ba + 4 * diag_offset[row];
274         pj    = bj + diag_offset[row] + 1;
275         x1    = pv[0];
276         x2    = pv[1];
277         x3    = pv[2];
278         x4    = pv[3];
279         pc[0] = m1 = p1 * x1 + p3 * x2;
280         pc[1] = m2 = p2 * x1 + p4 * x2;
281         pc[2] = m3 = p1 * x3 + p3 * x4;
282         pc[3] = m4 = p2 * x3 + p4 * x4;
283         nz         = bi[row + 1] - diag_offset[row] - 1;
284         pv += 4;
285         for (j = 0; j < nz; j++) {
286           x1 = pv[0];
287           x2 = pv[1];
288           x3 = pv[2];
289           x4 = pv[3];
290           x  = rtmp + 4 * pj[j];
291           x[0] -= m1 * x1 + m3 * x2;
292           x[1] -= m2 * x1 + m4 * x2;
293           x[2] -= m1 * x3 + m3 * x4;
294           x[3] -= m2 * x3 + m4 * x4;
295           pv += 4;
296         }
297         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
298       }
299       row = *ajtmp++;
300     }
301     /* finished row so stick it into b->a */
302     pv = ba + 4 * bi[i];
303     pj = bj + bi[i];
304     nz = bi[i + 1] - bi[i];
305     for (j = 0; j < nz; j++) {
306       x     = rtmp + 4 * pj[j];
307       pv[0] = x[0];
308       pv[1] = x[1];
309       pv[2] = x[2];
310       pv[3] = x[3];
311       pv += 4;
312     }
313     /* invert diagonal block */
314     w = ba + 4 * diag_offset[i];
315     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
316     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
317   }
318 
319   PetscCall(PetscFree(rtmp));
320   PetscCall(ISRestoreIndices(isicol, &ic));
321   PetscCall(ISRestoreIndices(isrow, &r));
322 
323   C->ops->solve          = MatSolve_SeqBAIJ_2_inplace;
324   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_inplace;
325   C->assembled           = PETSC_TRUE;
326 
327   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
328   PetscFunctionReturn(PETSC_SUCCESS);
329 }
330 /*
331       Version for when blocks are 2 by 2 Using natural ordering
332 */
333 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering_inplace(Mat C, Mat A, const MatFactorInfo *info)
334 {
335   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
336   PetscInt     i, j, n = a->mbs, *bi = b->i, *bj = b->j;
337   PetscInt    *ajtmpold, *ajtmp, nz, row;
338   PetscInt    *diag_offset = b->diag, *ai = a->i, *aj = a->j, *pj;
339   MatScalar   *pv, *v, *rtmp, *pc, *w, *x;
340   MatScalar    p1, p2, p3, p4, m1, m2, m3, m4, x1, x2, x3, x4;
341   MatScalar   *ba = b->a, *aa = a->a;
342   PetscReal    shift = info->shiftamount;
343   PetscBool    allowzeropivot, zeropivotdetected;
344 
345   PetscFunctionBegin;
346   allowzeropivot = PetscNot(A->erroriffailure);
347   PetscCall(PetscMalloc1(4 * (n + 1), &rtmp));
348   for (i = 0; i < n; i++) {
349     nz    = bi[i + 1] - bi[i];
350     ajtmp = bj + bi[i];
351     for (j = 0; j < nz; j++) {
352       x    = rtmp + 4 * ajtmp[j];
353       x[0] = x[1] = x[2] = x[3] = 0.0;
354     }
355     /* load in initial (unfactored row) */
356     nz       = ai[i + 1] - ai[i];
357     ajtmpold = aj + ai[i];
358     v        = aa + 4 * ai[i];
359     for (j = 0; j < nz; j++) {
360       x    = rtmp + 4 * ajtmpold[j];
361       x[0] = v[0];
362       x[1] = v[1];
363       x[2] = v[2];
364       x[3] = v[3];
365       v += 4;
366     }
367     row = *ajtmp++;
368     while (row < i) {
369       pc = rtmp + 4 * row;
370       p1 = pc[0];
371       p2 = pc[1];
372       p3 = pc[2];
373       p4 = pc[3];
374       if (p1 != (PetscScalar)0.0 || p2 != (PetscScalar)0.0 || p3 != (PetscScalar)0.0 || p4 != (PetscScalar)0.0) {
375         pv    = ba + 4 * diag_offset[row];
376         pj    = bj + diag_offset[row] + 1;
377         x1    = pv[0];
378         x2    = pv[1];
379         x3    = pv[2];
380         x4    = pv[3];
381         pc[0] = m1 = p1 * x1 + p3 * x2;
382         pc[1] = m2 = p2 * x1 + p4 * x2;
383         pc[2] = m3 = p1 * x3 + p3 * x4;
384         pc[3] = m4 = p2 * x3 + p4 * x4;
385         nz         = bi[row + 1] - diag_offset[row] - 1;
386         pv += 4;
387         for (j = 0; j < nz; j++) {
388           x1 = pv[0];
389           x2 = pv[1];
390           x3 = pv[2];
391           x4 = pv[3];
392           x  = rtmp + 4 * pj[j];
393           x[0] -= m1 * x1 + m3 * x2;
394           x[1] -= m2 * x1 + m4 * x2;
395           x[2] -= m1 * x3 + m3 * x4;
396           x[3] -= m2 * x3 + m4 * x4;
397           pv += 4;
398         }
399         PetscCall(PetscLogFlops(16.0 * nz + 12.0));
400       }
401       row = *ajtmp++;
402     }
403     /* finished row so stick it into b->a */
404     pv = ba + 4 * bi[i];
405     pj = bj + bi[i];
406     nz = bi[i + 1] - bi[i];
407     for (j = 0; j < nz; j++) {
408       x     = rtmp + 4 * pj[j];
409       pv[0] = x[0];
410       pv[1] = x[1];
411       pv[2] = x[2];
412       pv[3] = x[3];
413       /*
414       printf(" col %d:",pj[j]);
415       PetscInt j1;
416       for (j1=0; j1<4; j1++) printf(" %g,",*(pv+j1));
417       printf("\n");
418       */
419       pv += 4;
420     }
421     /* invert diagonal block */
422     w = ba + 4 * diag_offset[i];
423     PetscCall(PetscKernel_A_gets_inverse_A_2(w, shift, allowzeropivot, &zeropivotdetected));
424     if (zeropivotdetected) C->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
425   }
426 
427   PetscCall(PetscFree(rtmp));
428 
429   C->ops->solve          = MatSolve_SeqBAIJ_2_NaturalOrdering_inplace;
430   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace;
431   C->assembled           = PETSC_TRUE;
432 
433   PetscCall(PetscLogFlops(1.333333333333 * 8 * b->mbs)); /* from inverting diagonal blocks */
434   PetscFunctionReturn(PETSC_SUCCESS);
435 }
436 
437 /* ----------------------------------------------------------- */
438 /*
439      Version for when blocks are 1 by 1.
440 */
441 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1(Mat B, Mat A, const MatFactorInfo *info)
442 {
443   Mat              C = B;
444   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
445   IS               isrow = b->row, isicol = b->icol;
446   const PetscInt  *r, *ic, *ics;
447   const PetscInt   n = a->mbs, *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j, *bdiag = b->diag;
448   PetscInt         i, j, k, nz, nzL, row, *pj;
449   const PetscInt  *ajtmp, *bjtmp;
450   MatScalar       *rtmp, *pc, multiplier, *pv;
451   const MatScalar *aa = a->a, *v;
452   PetscBool        row_identity, col_identity;
453   FactorShiftCtx   sctx;
454   const PetscInt  *ddiag;
455   PetscReal        rs;
456   MatScalar        d;
457 
458   PetscFunctionBegin;
459   /* MatPivotSetUp(): initialize shift context sctx */
460   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
461 
462   if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) { /* set sctx.shift_top=max{rs} */
463     ddiag          = a->diag;
464     sctx.shift_top = info->zeropivot;
465     for (i = 0; i < n; i++) {
466       /* calculate sum(|aij|)-RealPart(aii), amt of shift needed for this row */
467       d  = (aa)[ddiag[i]];
468       rs = -PetscAbsScalar(d) - PetscRealPart(d);
469       v  = aa + ai[i];
470       nz = ai[i + 1] - ai[i];
471       for (j = 0; j < nz; j++) rs += PetscAbsScalar(v[j]);
472       if (rs > sctx.shift_top) sctx.shift_top = rs;
473     }
474     sctx.shift_top *= 1.1;
475     sctx.nshift_max = 5;
476     sctx.shift_lo   = 0.;
477     sctx.shift_hi   = 1.;
478   }
479 
480   PetscCall(ISGetIndices(isrow, &r));
481   PetscCall(ISGetIndices(isicol, &ic));
482   PetscCall(PetscMalloc1(n + 1, &rtmp));
483   ics = ic;
484 
485   do {
486     sctx.newshift = PETSC_FALSE;
487     for (i = 0; i < n; i++) {
488       /* zero rtmp */
489       /* L part */
490       nz    = bi[i + 1] - bi[i];
491       bjtmp = bj + bi[i];
492       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
493 
494       /* U part */
495       nz    = bdiag[i] - bdiag[i + 1];
496       bjtmp = bj + bdiag[i + 1] + 1;
497       for (j = 0; j < nz; j++) rtmp[bjtmp[j]] = 0.0;
498 
499       /* load in initial (unfactored row) */
500       nz    = ai[r[i] + 1] - ai[r[i]];
501       ajtmp = aj + ai[r[i]];
502       v     = aa + ai[r[i]];
503       for (j = 0; j < nz; j++) rtmp[ics[ajtmp[j]]] = v[j];
504 
505       /* ZeropivotApply() */
506       rtmp[i] += sctx.shift_amount; /* shift the diagonal of the matrix */
507 
508       /* elimination */
509       bjtmp = bj + bi[i];
510       row   = *bjtmp++;
511       nzL   = bi[i + 1] - bi[i];
512       for (k = 0; k < nzL; k++) {
513         pc = rtmp + row;
514         if (*pc != (PetscScalar)0.0) {
515           pv         = b->a + bdiag[row];
516           multiplier = *pc * (*pv);
517           *pc        = multiplier;
518 
519           pj = b->j + bdiag[row + 1] + 1; /* beginning of U(row,:) */
520           pv = b->a + bdiag[row + 1] + 1;
521           nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:) excluding diag */
522           for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
523           PetscCall(PetscLogFlops(2.0 * nz));
524         }
525         row = *bjtmp++;
526       }
527 
528       /* finished row so stick it into b->a */
529       rs = 0.0;
530       /* L part */
531       pv = b->a + bi[i];
532       pj = b->j + bi[i];
533       nz = bi[i + 1] - bi[i];
534       for (j = 0; j < nz; j++) {
535         pv[j] = rtmp[pj[j]];
536         rs += PetscAbsScalar(pv[j]);
537       }
538 
539       /* U part */
540       pv = b->a + bdiag[i + 1] + 1;
541       pj = b->j + bdiag[i + 1] + 1;
542       nz = bdiag[i] - bdiag[i + 1] - 1;
543       for (j = 0; j < nz; j++) {
544         pv[j] = rtmp[pj[j]];
545         rs += PetscAbsScalar(pv[j]);
546       }
547 
548       sctx.rs = rs;
549       sctx.pv = rtmp[i];
550       PetscCall(MatPivotCheck(B, A, info, &sctx, i));
551       if (sctx.newshift) break; /* break for-loop */
552       rtmp[i] = sctx.pv;        /* sctx.pv might be updated in the case of MAT_SHIFT_INBLOCKS */
553 
554       /* Mark diagonal and invert diagonal for simpler triangular solves */
555       pv  = b->a + bdiag[i];
556       *pv = (PetscScalar)1.0 / rtmp[i];
557 
558     } /* endof for (i=0; i<n; i++) { */
559 
560     /* MatPivotRefine() */
561     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE && !sctx.newshift && sctx.shift_fraction > 0 && sctx.nshift < sctx.nshift_max) {
562       /*
563        * if no shift in this attempt & shifting & started shifting & can refine,
564        * then try lower shift
565        */
566       sctx.shift_hi       = sctx.shift_fraction;
567       sctx.shift_fraction = (sctx.shift_hi + sctx.shift_lo) / 2.;
568       sctx.shift_amount   = sctx.shift_fraction * sctx.shift_top;
569       sctx.newshift       = PETSC_TRUE;
570       sctx.nshift++;
571     }
572   } while (sctx.newshift);
573 
574   PetscCall(PetscFree(rtmp));
575   PetscCall(ISRestoreIndices(isicol, &ic));
576   PetscCall(ISRestoreIndices(isrow, &r));
577 
578   PetscCall(ISIdentity(isrow, &row_identity));
579   PetscCall(ISIdentity(isicol, &col_identity));
580   if (row_identity && col_identity) {
581     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering;
582     C->ops->forwardsolve   = MatForwardSolve_SeqBAIJ_1_NaturalOrdering;
583     C->ops->backwardsolve  = MatBackwardSolve_SeqBAIJ_1_NaturalOrdering;
584     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
585   } else {
586     C->ops->solve          = MatSolve_SeqBAIJ_1;
587     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1;
588   }
589   C->assembled = PETSC_TRUE;
590   PetscCall(PetscLogFlops(C->cmap->n));
591 
592   /* MatShiftView(A,info,&sctx) */
593   if (sctx.nshift) {
594     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
595       PetscCall(PetscInfo(A, "number of shift_pd tries %" PetscInt_FMT ", shift_amount %g, diagonal shifted up by %e fraction top_value %e\n", sctx.nshift, (double)sctx.shift_amount, (double)sctx.shift_fraction, (double)sctx.shift_top));
596     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
597       PetscCall(PetscInfo(A, "number of shift_nz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
598     } else if (info->shifttype == (PetscReal)MAT_SHIFT_INBLOCKS) {
599       PetscCall(PetscInfo(A, "number of shift_inblocks applied %" PetscInt_FMT ", each shift_amount %g\n", sctx.nshift, (double)info->shiftamount));
600     }
601   }
602   PetscFunctionReturn(PETSC_SUCCESS);
603 }
604 
605 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_1_inplace(Mat C, Mat A, const MatFactorInfo *info)
606 {
607   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b = (Mat_SeqBAIJ *)C->data;
608   IS              isrow = b->row, isicol = b->icol;
609   const PetscInt *r, *ic;
610   PetscInt        i, j, n = a->mbs, *bi = b->i, *bj = b->j;
611   PetscInt       *ajtmpold, *ajtmp, nz, row, *ai = a->i, *aj = a->j;
612   PetscInt       *diag_offset = b->diag, diag, *pj;
613   MatScalar      *pv, *v, *rtmp, multiplier, *pc;
614   MatScalar      *ba = b->a, *aa = a->a;
615   PetscBool       row_identity, col_identity;
616 
617   PetscFunctionBegin;
618   PetscCall(ISGetIndices(isrow, &r));
619   PetscCall(ISGetIndices(isicol, &ic));
620   PetscCall(PetscMalloc1(n + 1, &rtmp));
621 
622   for (i = 0; i < n; i++) {
623     nz    = bi[i + 1] - bi[i];
624     ajtmp = bj + bi[i];
625     for (j = 0; j < nz; j++) rtmp[ajtmp[j]] = 0.0;
626 
627     /* load in initial (unfactored row) */
628     nz       = ai[r[i] + 1] - ai[r[i]];
629     ajtmpold = aj + ai[r[i]];
630     v        = aa + ai[r[i]];
631     for (j = 0; j < nz; j++) rtmp[ic[ajtmpold[j]]] = v[j];
632 
633     row = *ajtmp++;
634     while (row < i) {
635       pc = rtmp + row;
636       if (*pc != 0.0) {
637         pv         = ba + diag_offset[row];
638         pj         = bj + diag_offset[row] + 1;
639         multiplier = *pc * *pv++;
640         *pc        = multiplier;
641         nz         = bi[row + 1] - diag_offset[row] - 1;
642         for (j = 0; j < nz; j++) rtmp[pj[j]] -= multiplier * pv[j];
643         PetscCall(PetscLogFlops(1.0 + 2.0 * nz));
644       }
645       row = *ajtmp++;
646     }
647     /* finished row so stick it into b->a */
648     pv = ba + bi[i];
649     pj = bj + bi[i];
650     nz = bi[i + 1] - bi[i];
651     for (j = 0; j < nz; j++) pv[j] = rtmp[pj[j]];
652     diag = diag_offset[i] - bi[i];
653     /* check pivot entry for current row */
654     PetscCheck(pv[diag] != 0.0, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Zero pivot: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
655     pv[diag] = 1.0 / pv[diag];
656   }
657 
658   PetscCall(PetscFree(rtmp));
659   PetscCall(ISRestoreIndices(isicol, &ic));
660   PetscCall(ISRestoreIndices(isrow, &r));
661   PetscCall(ISIdentity(isrow, &row_identity));
662   PetscCall(ISIdentity(isicol, &col_identity));
663   if (row_identity && col_identity) {
664     C->ops->solve          = MatSolve_SeqBAIJ_1_NaturalOrdering_inplace;
665     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace;
666   } else {
667     C->ops->solve          = MatSolve_SeqBAIJ_1_inplace;
668     C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_inplace;
669   }
670   C->assembled = PETSC_TRUE;
671   PetscCall(PetscLogFlops(C->cmap->n));
672   PetscFunctionReturn(PETSC_SUCCESS);
673 }
674 
675 static PetscErrorCode MatFactorGetSolverType_petsc(Mat A, MatSolverType *type)
676 {
677   PetscFunctionBegin;
678   *type = MATSOLVERPETSC;
679   PetscFunctionReturn(PETSC_SUCCESS);
680 }
681 
682 PETSC_INTERN PetscErrorCode MatGetFactor_seqbaij_petsc(Mat A, MatFactorType ftype, Mat *B)
683 {
684   PetscInt n = A->rmap->n;
685 
686   PetscFunctionBegin;
687 #if defined(PETSC_USE_COMPLEX)
688   PetscCheck(A->hermitian != PETSC_BOOL3_TRUE || !(ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC), PETSC_COMM_SELF, PETSC_ERR_SUP, "Hermitian Factor is not supported");
689 #endif
690   PetscCall(MatCreate(PetscObjectComm((PetscObject)A), B));
691   PetscCall(MatSetSizes(*B, n, n, n, n));
692   if (ftype == MAT_FACTOR_LU || ftype == MAT_FACTOR_ILU || ftype == MAT_FACTOR_ILUDT) {
693     PetscCall(MatSetType(*B, MATSEQBAIJ));
694 
695     (*B)->ops->lufactorsymbolic  = MatLUFactorSymbolic_SeqBAIJ;
696     (*B)->ops->ilufactorsymbolic = MatILUFactorSymbolic_SeqBAIJ;
697     PetscCall(PetscStrallocpy(MATORDERINGND, (char **)&(*B)->preferredordering[MAT_FACTOR_LU]));
698     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILU]));
699     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ILUDT]));
700   } else if (ftype == MAT_FACTOR_CHOLESKY || ftype == MAT_FACTOR_ICC) {
701     PetscCall(MatSetType(*B, MATSEQSBAIJ));
702     PetscCall(MatSeqSBAIJSetPreallocation(*B, A->rmap->bs, MAT_SKIP_ALLOCATION, NULL));
703 
704     (*B)->ops->iccfactorsymbolic      = MatICCFactorSymbolic_SeqBAIJ;
705     (*B)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqBAIJ;
706     /*  Future optimization would be direct symbolic and numerical factorization for BAIJ to support orderings and Cholesky, instead of first converting to SBAIJ */
707     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_CHOLESKY]));
708     PetscCall(PetscStrallocpy(MATORDERINGNATURAL, (char **)&(*B)->preferredordering[MAT_FACTOR_ICC]));
709   } else SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Factor type not supported");
710   (*B)->factortype     = ftype;
711   (*B)->canuseordering = PETSC_TRUE;
712 
713   PetscCall(PetscFree((*B)->solvertype));
714   PetscCall(PetscStrallocpy(MATSOLVERPETSC, &(*B)->solvertype));
715   PetscCall(PetscObjectComposeFunction((PetscObject)*B, "MatFactorGetSolverType_C", MatFactorGetSolverType_petsc));
716   PetscFunctionReturn(PETSC_SUCCESS);
717 }
718 
719 /* ----------------------------------------------------------- */
720 PetscErrorCode MatLUFactor_SeqBAIJ(Mat A, IS row, IS col, const MatFactorInfo *info)
721 {
722   Mat C;
723 
724   PetscFunctionBegin;
725   PetscCall(MatGetFactor(A, MATSOLVERPETSC, MAT_FACTOR_LU, &C));
726   PetscCall(MatLUFactorSymbolic(C, A, row, col, info));
727   PetscCall(MatLUFactorNumeric(C, A, info));
728 
729   A->ops->solve          = C->ops->solve;
730   A->ops->solvetranspose = C->ops->solvetranspose;
731 
732   PetscCall(MatHeaderMerge(A, &C));
733   PetscFunctionReturn(PETSC_SUCCESS);
734 }
735 
736 #include <../src/mat/impls/sbaij/seq/sbaij.h>
737 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N(Mat C, Mat A, const MatFactorInfo *info)
738 {
739   Mat_SeqBAIJ    *a  = (Mat_SeqBAIJ *)A->data;
740   Mat_SeqSBAIJ   *b  = (Mat_SeqSBAIJ *)C->data;
741   IS              ip = b->row;
742   const PetscInt *rip;
743   PetscInt        i, j, mbs = a->mbs, bs = A->rmap->bs, *bi = b->i, *bj = b->j, *bcol;
744   PetscInt       *ai = a->i, *aj = a->j;
745   PetscInt        k, jmin, jmax, *jl, *il, col, nexti, ili, nz;
746   MatScalar      *rtmp, *ba = b->a, *bval, *aa = a->a, dk, uikdi;
747   PetscReal       rs;
748   FactorShiftCtx  sctx;
749 
750   PetscFunctionBegin;
751   if (bs > 1) { /* convert A to a SBAIJ matrix and apply Cholesky factorization from it */
752     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
753     PetscCall((a->sbaijMat)->ops->choleskyfactornumeric(C, a->sbaijMat, info));
754     PetscCall(MatDestroy(&a->sbaijMat));
755     PetscFunctionReturn(PETSC_SUCCESS);
756   }
757 
758   /* MatPivotSetUp(): initialize shift context sctx */
759   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
760 
761   PetscCall(ISGetIndices(ip, &rip));
762   PetscCall(PetscMalloc3(mbs, &rtmp, mbs, &il, mbs, &jl));
763 
764   sctx.shift_amount = 0.;
765   sctx.nshift       = 0;
766   do {
767     sctx.newshift = PETSC_FALSE;
768     for (i = 0; i < mbs; i++) {
769       rtmp[i] = 0.0;
770       jl[i]   = mbs;
771       il[0]   = 0;
772     }
773 
774     for (k = 0; k < mbs; k++) {
775       bval = ba + bi[k];
776       /* initialize k-th row by the perm[k]-th row of A */
777       jmin = ai[rip[k]];
778       jmax = ai[rip[k] + 1];
779       for (j = jmin; j < jmax; j++) {
780         col = rip[aj[j]];
781         if (col >= k) { /* only take upper triangular entry */
782           rtmp[col] = aa[j];
783           *bval++   = 0.0; /* for in-place factorization */
784         }
785       }
786 
787       /* shift the diagonal of the matrix */
788       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
789 
790       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
791       dk = rtmp[k];
792       i  = jl[k]; /* first row to be added to k_th row  */
793 
794       while (i < k) {
795         nexti = jl[i]; /* next row to be added to k_th row */
796 
797         /* compute multiplier, update diag(k) and U(i,k) */
798         ili   = il[i];                /* index of first nonzero element in U(i,k:bms-1) */
799         uikdi = -ba[ili] * ba[bi[i]]; /* diagonal(k) */
800         dk += uikdi * ba[ili];
801         ba[ili] = uikdi; /* -U(i,k) */
802 
803         /* add multiple of row i to k-th row */
804         jmin = ili + 1;
805         jmax = bi[i + 1];
806         if (jmin < jmax) {
807           for (j = jmin; j < jmax; j++) rtmp[bj[j]] += uikdi * ba[j];
808           /* update il and jl for row i */
809           il[i] = jmin;
810           j     = bj[jmin];
811           jl[i] = jl[j];
812           jl[j] = i;
813         }
814         i = nexti;
815       }
816 
817       /* shift the diagonals when zero pivot is detected */
818       /* compute rs=sum of abs(off-diagonal) */
819       rs   = 0.0;
820       jmin = bi[k] + 1;
821       nz   = bi[k + 1] - jmin;
822       if (nz) {
823         bcol = bj + jmin;
824         while (nz--) {
825           rs += PetscAbsScalar(rtmp[*bcol]);
826           bcol++;
827         }
828       }
829 
830       sctx.rs = rs;
831       sctx.pv = dk;
832       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
833       if (sctx.newshift) break;
834       dk = sctx.pv;
835 
836       /* copy data into U(k,:) */
837       ba[bi[k]] = 1.0 / dk; /* U(k,k) */
838       jmin      = bi[k] + 1;
839       jmax      = bi[k + 1];
840       if (jmin < jmax) {
841         for (j = jmin; j < jmax; j++) {
842           col       = bj[j];
843           ba[j]     = rtmp[col];
844           rtmp[col] = 0.0;
845         }
846         /* add the k-th row into il and jl */
847         il[k] = jmin;
848         i     = bj[jmin];
849         jl[k] = jl[i];
850         jl[i] = k;
851       }
852     }
853   } while (sctx.newshift);
854   PetscCall(PetscFree3(rtmp, il, jl));
855 
856   PetscCall(ISRestoreIndices(ip, &rip));
857 
858   C->assembled    = PETSC_TRUE;
859   C->preallocated = PETSC_TRUE;
860 
861   PetscCall(PetscLogFlops(C->rmap->N));
862   if (sctx.nshift) {
863     if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
864       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
865     } else if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
866       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
867     }
868   }
869   PetscFunctionReturn(PETSC_SUCCESS);
870 }
871 
872 PetscErrorCode MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering(Mat C, Mat A, const MatFactorInfo *info)
873 {
874   Mat_SeqBAIJ   *a = (Mat_SeqBAIJ *)A->data;
875   Mat_SeqSBAIJ  *b = (Mat_SeqSBAIJ *)C->data;
876   PetscInt       i, j, am = a->mbs;
877   PetscInt      *ai = a->i, *aj = a->j, *bi = b->i, *bj = b->j;
878   PetscInt       k, jmin, *jl, *il, nexti, ili, *acol, *bcol, nz;
879   MatScalar     *rtmp, *ba = b->a, *aa = a->a, dk, uikdi, *aval, *bval;
880   PetscReal      rs;
881   FactorShiftCtx sctx;
882 
883   PetscFunctionBegin;
884   /* MatPivotSetUp(): initialize shift context sctx */
885   PetscCall(PetscMemzero(&sctx, sizeof(FactorShiftCtx)));
886 
887   PetscCall(PetscMalloc3(am, &rtmp, am, &il, am, &jl));
888 
889   do {
890     sctx.newshift = PETSC_FALSE;
891     for (i = 0; i < am; i++) {
892       rtmp[i] = 0.0;
893       jl[i]   = am;
894       il[0]   = 0;
895     }
896 
897     for (k = 0; k < am; k++) {
898       /* initialize k-th row with elements nonzero in row perm(k) of A */
899       nz   = ai[k + 1] - ai[k];
900       acol = aj + ai[k];
901       aval = aa + ai[k];
902       bval = ba + bi[k];
903       while (nz--) {
904         if (*acol < k) { /* skip lower triangular entries */
905           acol++;
906           aval++;
907         } else {
908           rtmp[*acol++] = *aval++;
909           *bval++       = 0.0; /* for in-place factorization */
910         }
911       }
912 
913       /* shift the diagonal of the matrix */
914       if (sctx.nshift) rtmp[k] += sctx.shift_amount;
915 
916       /* modify k-th row by adding in those rows i with U(i,k)!=0 */
917       dk = rtmp[k];
918       i  = jl[k]; /* first row to be added to k_th row  */
919 
920       while (i < k) {
921         nexti = jl[i]; /* next row to be added to k_th row */
922         /* compute multiplier, update D(k) and U(i,k) */
923         ili   = il[i]; /* index of first nonzero element in U(i,k:bms-1) */
924         uikdi = -ba[ili] * ba[bi[i]];
925         dk += uikdi * ba[ili];
926         ba[ili] = uikdi; /* -U(i,k) */
927 
928         /* add multiple of row i to k-th row ... */
929         jmin = ili + 1;
930         nz   = bi[i + 1] - jmin;
931         if (nz > 0) {
932           bcol = bj + jmin;
933           bval = ba + jmin;
934           while (nz--) rtmp[*bcol++] += uikdi * (*bval++);
935           /* update il and jl for i-th row */
936           il[i] = jmin;
937           j     = bj[jmin];
938           jl[i] = jl[j];
939           jl[j] = i;
940         }
941         i = nexti;
942       }
943 
944       /* shift the diagonals when zero pivot is detected */
945       /* compute rs=sum of abs(off-diagonal) */
946       rs   = 0.0;
947       jmin = bi[k] + 1;
948       nz   = bi[k + 1] - jmin;
949       if (nz) {
950         bcol = bj + jmin;
951         while (nz--) {
952           rs += PetscAbsScalar(rtmp[*bcol]);
953           bcol++;
954         }
955       }
956 
957       sctx.rs = rs;
958       sctx.pv = dk;
959       PetscCall(MatPivotCheck(C, A, info, &sctx, k));
960       if (sctx.newshift) break; /* sctx.shift_amount is updated */
961       dk = sctx.pv;
962 
963       /* copy data into U(k,:) */
964       ba[bi[k]] = 1.0 / dk;
965       jmin      = bi[k] + 1;
966       nz        = bi[k + 1] - jmin;
967       if (nz) {
968         bcol = bj + jmin;
969         bval = ba + jmin;
970         while (nz--) {
971           *bval++       = rtmp[*bcol];
972           rtmp[*bcol++] = 0.0;
973         }
974         /* add k-th row into il and jl */
975         il[k] = jmin;
976         i     = bj[jmin];
977         jl[k] = jl[i];
978         jl[i] = k;
979       }
980     }
981   } while (sctx.newshift);
982   PetscCall(PetscFree3(rtmp, il, jl));
983 
984   C->ops->solve          = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
985   C->ops->solvetranspose = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
986   C->assembled           = PETSC_TRUE;
987   C->preallocated        = PETSC_TRUE;
988 
989   PetscCall(PetscLogFlops(C->rmap->N));
990   if (sctx.nshift) {
991     if (info->shifttype == (PetscReal)MAT_SHIFT_NONZERO) {
992       PetscCall(PetscInfo(A, "number of shiftnz tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
993     } else if (info->shifttype == (PetscReal)MAT_SHIFT_POSITIVE_DEFINITE) {
994       PetscCall(PetscInfo(A, "number of shiftpd tries %" PetscInt_FMT ", shift_amount %g\n", sctx.nshift, (double)sctx.shift_amount));
995     }
996   }
997   PetscFunctionReturn(PETSC_SUCCESS);
998 }
999 
1000 #include <petscbt.h>
1001 #include <../src/mat/utils/freespace.h>
1002 PetscErrorCode MatICCFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1003 {
1004   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1005   Mat_SeqSBAIJ      *b;
1006   Mat                B;
1007   PetscBool          perm_identity, missing;
1008   PetscInt           reallocs = 0, i, *ai = a->i, *aj = a->j, am = a->mbs, bs = A->rmap->bs, *ui;
1009   const PetscInt    *rip;
1010   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
1011   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, ncols_upper, *cols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
1012   PetscReal          fill = info->fill, levels = info->levels;
1013   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1014   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
1015   PetscBT            lnkbt;
1016 
1017   PetscFunctionBegin;
1018   PetscCall(MatMissingDiagonal(A, &missing, &i));
1019   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1020 
1021   if (bs > 1) {
1022     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1023     (fact)->ops->iccfactorsymbolic = MatICCFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1024 
1025     PetscCall(MatICCFactorSymbolic(fact, a->sbaijMat, perm, info));
1026     PetscFunctionReturn(PETSC_SUCCESS);
1027   }
1028 
1029   PetscCall(ISIdentity(perm, &perm_identity));
1030   PetscCall(ISGetIndices(perm, &rip));
1031 
1032   /* special case that simply copies fill pattern */
1033   if (!levels && perm_identity) {
1034     PetscCall(PetscMalloc1(am + 1, &ui));
1035     for (i = 0; i < am; i++) ui[i] = ai[i + 1] - a->diag[i]; /* ui: rowlengths - changes when !perm_identity */
1036     B = fact;
1037     PetscCall(MatSeqSBAIJSetPreallocation(B, 1, 0, ui));
1038 
1039     b  = (Mat_SeqSBAIJ *)B->data;
1040     uj = b->j;
1041     for (i = 0; i < am; i++) {
1042       aj = a->j + a->diag[i];
1043       for (j = 0; j < ui[i]; j++) *uj++ = *aj++;
1044       b->ilen[i] = ui[i];
1045     }
1046     PetscCall(PetscFree(ui));
1047 
1048     B->factortype = MAT_FACTOR_NONE;
1049 
1050     PetscCall(MatAssemblyBegin(B, MAT_FINAL_ASSEMBLY));
1051     PetscCall(MatAssemblyEnd(B, MAT_FINAL_ASSEMBLY));
1052     B->factortype = MAT_FACTOR_ICC;
1053 
1054     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1055     PetscFunctionReturn(PETSC_SUCCESS);
1056   }
1057 
1058   /* initialization */
1059   PetscCall(PetscMalloc1(am + 1, &ui));
1060   ui[0] = 0;
1061   PetscCall(PetscMalloc1(2 * am + 1, &cols_lvl));
1062 
1063   /* jl: linked list for storing indices of the pivot rows
1064      il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
1065   PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
1066   for (i = 0; i < am; i++) {
1067     jl[i] = am;
1068     il[i] = 0;
1069   }
1070 
1071   /* create and initialize a linked list for storing column indices of the active row k */
1072   nlnk = am + 1;
1073   PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
1074 
1075   /* initial FreeSpace size is fill*(ai[am]+am)/2 */
1076   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space));
1077 
1078   current_space = free_space;
1079 
1080   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[am] / 2, am / 2)), &free_space_lvl));
1081   current_space_lvl = free_space_lvl;
1082 
1083   for (k = 0; k < am; k++) { /* for each active row k */
1084     /* initialize lnk by the column indices of row rip[k] of A */
1085     nzk         = 0;
1086     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1087     ncols_upper = 0;
1088     cols        = cols_lvl + am;
1089     for (j = 0; j < ncols; j++) {
1090       i = rip[*(aj + ai[rip[k]] + j)];
1091       if (i >= k) { /* only take upper triangular entry */
1092         cols[ncols_upper]     = i;
1093         cols_lvl[ncols_upper] = -1; /* initialize level for nonzero entries */
1094         ncols_upper++;
1095       }
1096     }
1097     PetscCall(PetscIncompleteLLAdd(ncols_upper, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1098     nzk += nlnk;
1099 
1100     /* update lnk by computing fill-in for each pivot row to be merged in */
1101     prow = jl[k]; /* 1st pivot row */
1102 
1103     while (prow < k) {
1104       nextprow = jl[prow];
1105 
1106       /* merge prow into k-th row */
1107       jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
1108       jmax  = ui[prow + 1];
1109       ncols = jmax - jmin;
1110       i     = jmin - ui[prow];
1111       cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
1112       for (j = 0; j < ncols; j++) cols_lvl[j] = *(uj_lvl_ptr[prow] + i + j);
1113       PetscCall(PetscIncompleteLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt));
1114       nzk += nlnk;
1115 
1116       /* update il and jl for prow */
1117       if (jmin < jmax) {
1118         il[prow] = jmin;
1119 
1120         j        = *cols;
1121         jl[prow] = jl[j];
1122         jl[j]    = prow;
1123       }
1124       prow = nextprow;
1125     }
1126 
1127     /* if free space is not available, make more free space */
1128     if (current_space->local_remaining < nzk) {
1129       i = am - k + 1;                                                             /* num of unfactored rows */
1130       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1131       PetscCall(PetscFreeSpaceGet(i, &current_space));
1132       PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
1133       reallocs++;
1134     }
1135 
1136     /* copy data into free_space and free_space_lvl, then initialize lnk */
1137     PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
1138 
1139     /* add the k-th row into il and jl */
1140     if (nzk - 1 > 0) {
1141       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
1142       jl[k] = jl[i];
1143       jl[i] = k;
1144       il[k] = ui[k] + 1;
1145     }
1146     uj_ptr[k]     = current_space->array;
1147     uj_lvl_ptr[k] = current_space_lvl->array;
1148 
1149     current_space->array += nzk;
1150     current_space->local_used += nzk;
1151     current_space->local_remaining -= nzk;
1152 
1153     current_space_lvl->array += nzk;
1154     current_space_lvl->local_used += nzk;
1155     current_space_lvl->local_remaining -= nzk;
1156 
1157     ui[k + 1] = ui[k] + nzk;
1158   }
1159 
1160   PetscCall(ISRestoreIndices(perm, &rip));
1161   PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
1162   PetscCall(PetscFree(cols_lvl));
1163 
1164   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1165   PetscCall(PetscMalloc1(ui[am] + 1, &uj));
1166   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1167   PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
1168   PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
1169 
1170   /* put together the new matrix in MATSEQSBAIJ format */
1171   B = fact;
1172   PetscCall(MatSeqSBAIJSetPreallocation(B, 1, MAT_SKIP_ALLOCATION, NULL));
1173 
1174   b               = (Mat_SeqSBAIJ *)B->data;
1175   b->singlemalloc = PETSC_FALSE;
1176   b->free_a       = PETSC_TRUE;
1177   b->free_ij      = PETSC_TRUE;
1178 
1179   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
1180 
1181   b->j             = uj;
1182   b->i             = ui;
1183   b->diag          = NULL;
1184   b->ilen          = NULL;
1185   b->imax          = NULL;
1186   b->row           = perm;
1187   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1188 
1189   PetscCall(PetscObjectReference((PetscObject)perm));
1190 
1191   b->icol = perm;
1192 
1193   PetscCall(PetscObjectReference((PetscObject)perm));
1194   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
1195 
1196   b->maxnz = b->nz = ui[am];
1197 
1198   B->info.factor_mallocs   = reallocs;
1199   B->info.fill_ratio_given = fill;
1200   if (ai[am] != 0.) {
1201     /* nonzeros in lower triangular part of A (including diagonals)= (ai[am]+am)/2 */
1202     B->info.fill_ratio_needed = ((PetscReal)2 * ui[am]) / (ai[am] + am);
1203   } else {
1204     B->info.fill_ratio_needed = 0.0;
1205   }
1206 #if defined(PETSC_USE_INFO)
1207   if (ai[am] != 0) {
1208     PetscReal af = B->info.fill_ratio_needed;
1209     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1210     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1211     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1212   } else {
1213     PetscCall(PetscInfo(A, "Empty matrix\n"));
1214   }
1215 #endif
1216   if (perm_identity) {
1217     B->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1218     B->ops->solvetranspose        = MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace;
1219     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1220   } else {
1221     (fact)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1222   }
1223   PetscFunctionReturn(PETSC_SUCCESS);
1224 }
1225 
1226 PetscErrorCode MatCholeskyFactorSymbolic_SeqBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
1227 {
1228   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1229   Mat_SeqSBAIJ      *b;
1230   Mat                B;
1231   PetscBool          perm_identity, missing;
1232   PetscReal          fill = info->fill;
1233   const PetscInt    *rip;
1234   PetscInt           i, mbs = a->mbs, bs = A->rmap->bs, *ai = a->i, *aj = a->j, reallocs = 0, prow;
1235   PetscInt          *jl, jmin, jmax, nzk, *ui, k, j, *il, nextprow;
1236   PetscInt           nlnk, *lnk, ncols, ncols_upper, *cols, *uj, **ui_ptr, *uj_ptr;
1237   PetscFreeSpaceList free_space = NULL, current_space = NULL;
1238   PetscBT            lnkbt;
1239 
1240   PetscFunctionBegin;
1241   if (bs > 1) { /* convert to seqsbaij */
1242     if (!a->sbaijMat) PetscCall(MatConvert(A, MATSEQSBAIJ, MAT_INITIAL_MATRIX, &a->sbaijMat));
1243     (fact)->ops->choleskyfactorsymbolic = MatCholeskyFactorSymbolic_SeqSBAIJ; /* undue the change made in MatGetFactor_seqbaij_petsc */
1244 
1245     PetscCall(MatCholeskyFactorSymbolic(fact, a->sbaijMat, perm, info));
1246     PetscFunctionReturn(PETSC_SUCCESS);
1247   }
1248 
1249   PetscCall(MatMissingDiagonal(A, &missing, &i));
1250   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1251 
1252   /* check whether perm is the identity mapping */
1253   PetscCall(ISIdentity(perm, &perm_identity));
1254   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported");
1255   PetscCall(ISGetIndices(perm, &rip));
1256 
1257   /* initialization */
1258   PetscCall(PetscMalloc1(mbs + 1, &ui));
1259   ui[0] = 0;
1260 
1261   /* jl: linked list for storing indices of the pivot rows
1262      il: il[i] points to the 1st nonzero entry of U(i,k:mbs-1) */
1263   PetscCall(PetscMalloc4(mbs, &ui_ptr, mbs, &il, mbs, &jl, mbs, &cols));
1264   for (i = 0; i < mbs; i++) {
1265     jl[i] = mbs;
1266     il[i] = 0;
1267   }
1268 
1269   /* create and initialize a linked list for storing column indices of the active row k */
1270   nlnk = mbs + 1;
1271   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1272 
1273   /* initial FreeSpace size is fill* (ai[mbs]+mbs)/2 */
1274   PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, PetscIntSumTruncate(ai[mbs] / 2, mbs / 2)), &free_space));
1275 
1276   current_space = free_space;
1277 
1278   for (k = 0; k < mbs; k++) { /* for each active row k */
1279     /* initialize lnk by the column indices of row rip[k] of A */
1280     nzk         = 0;
1281     ncols       = ai[rip[k] + 1] - ai[rip[k]];
1282     ncols_upper = 0;
1283     for (j = 0; j < ncols; j++) {
1284       i = rip[*(aj + ai[rip[k]] + j)];
1285       if (i >= k) { /* only take upper triangular entry */
1286         cols[ncols_upper] = i;
1287         ncols_upper++;
1288       }
1289     }
1290     PetscCall(PetscLLAdd(ncols_upper, cols, mbs, &nlnk, lnk, lnkbt));
1291     nzk += nlnk;
1292 
1293     /* update lnk by computing fill-in for each pivot row to be merged in */
1294     prow = jl[k]; /* 1st pivot row */
1295 
1296     while (prow < k) {
1297       nextprow = jl[prow];
1298       /* merge prow into k-th row */
1299       jmin   = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:mbs-1) */
1300       jmax   = ui[prow + 1];
1301       ncols  = jmax - jmin;
1302       uj_ptr = ui_ptr[prow] + jmin - ui[prow]; /* points to the 2nd nzero entry in U(prow,k:mbs-1) */
1303       PetscCall(PetscLLAddSorted(ncols, uj_ptr, mbs, &nlnk, lnk, lnkbt));
1304       nzk += nlnk;
1305 
1306       /* update il and jl for prow */
1307       if (jmin < jmax) {
1308         il[prow] = jmin;
1309         j        = *uj_ptr;
1310         jl[prow] = jl[j];
1311         jl[j]    = prow;
1312       }
1313       prow = nextprow;
1314     }
1315 
1316     /* if free space is not available, make more free space */
1317     if (current_space->local_remaining < nzk) {
1318       i = mbs - k + 1;                                                            /* num of unfactored rows */
1319       i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
1320       PetscCall(PetscFreeSpaceGet(i, &current_space));
1321       reallocs++;
1322     }
1323 
1324     /* copy data into free space, then initialize lnk */
1325     PetscCall(PetscLLClean(mbs, mbs, nzk, lnk, current_space->array, lnkbt));
1326 
1327     /* add the k-th row into il and jl */
1328     if (nzk - 1 > 0) {
1329       i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:mbs-1) */
1330       jl[k] = jl[i];
1331       jl[i] = k;
1332       il[k] = ui[k] + 1;
1333     }
1334     ui_ptr[k] = current_space->array;
1335     current_space->array += nzk;
1336     current_space->local_used += nzk;
1337     current_space->local_remaining -= nzk;
1338 
1339     ui[k + 1] = ui[k] + nzk;
1340   }
1341 
1342   PetscCall(ISRestoreIndices(perm, &rip));
1343   PetscCall(PetscFree4(ui_ptr, il, jl, cols));
1344 
1345   /* copy free_space into uj and free free_space; set uj in new datastructure; */
1346   PetscCall(PetscMalloc1(ui[mbs] + 1, &uj));
1347   PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
1348   PetscCall(PetscLLDestroy(lnk, lnkbt));
1349 
1350   /* put together the new matrix in MATSEQSBAIJ format */
1351   B = fact;
1352   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1353 
1354   b               = (Mat_SeqSBAIJ *)B->data;
1355   b->singlemalloc = PETSC_FALSE;
1356   b->free_a       = PETSC_TRUE;
1357   b->free_ij      = PETSC_TRUE;
1358 
1359   PetscCall(PetscMalloc1(ui[mbs] + 1, &b->a));
1360 
1361   b->j             = uj;
1362   b->i             = ui;
1363   b->diag          = NULL;
1364   b->ilen          = NULL;
1365   b->imax          = NULL;
1366   b->row           = perm;
1367   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
1368 
1369   PetscCall(PetscObjectReference((PetscObject)perm));
1370   b->icol = perm;
1371   PetscCall(PetscObjectReference((PetscObject)perm));
1372   PetscCall(PetscMalloc1(mbs + 1, &b->solve_work));
1373   b->maxnz = b->nz = ui[mbs];
1374 
1375   B->info.factor_mallocs   = reallocs;
1376   B->info.fill_ratio_given = fill;
1377   if (ai[mbs] != 0.) {
1378     /* nonzeros in lower triangular part of A = (ai[mbs]+mbs)/2 */
1379     B->info.fill_ratio_needed = ((PetscReal)2 * ui[mbs]) / (ai[mbs] + mbs);
1380   } else {
1381     B->info.fill_ratio_needed = 0.0;
1382   }
1383 #if defined(PETSC_USE_INFO)
1384   if (ai[mbs] != 0.) {
1385     PetscReal af = B->info.fill_ratio_needed;
1386     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
1387     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
1388     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
1389   } else {
1390     PetscCall(PetscInfo(A, "Empty matrix\n"));
1391   }
1392 #endif
1393   if (perm_identity) {
1394     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N_NaturalOrdering;
1395   } else {
1396     B->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqBAIJ_N;
1397   }
1398   PetscFunctionReturn(PETSC_SUCCESS);
1399 }
1400 
1401 PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering(Mat A, Vec bb, Vec xx)
1402 {
1403   Mat_SeqBAIJ       *a  = (Mat_SeqBAIJ *)A->data;
1404   const PetscInt    *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1405   PetscInt           i, k, n                       = a->mbs;
1406   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1407   const MatScalar   *aa = a->a, *v;
1408   PetscScalar       *x, *s, *t, *ls;
1409   const PetscScalar *b;
1410 
1411   PetscFunctionBegin;
1412   PetscCall(VecGetArrayRead(bb, &b));
1413   PetscCall(VecGetArray(xx, &x));
1414   t = a->solve_work;
1415 
1416   /* forward solve the lower triangular */
1417   PetscCall(PetscArraycpy(t, b, bs)); /* copy 1st block of b to t */
1418 
1419   for (i = 1; i < n; i++) {
1420     v  = aa + bs2 * ai[i];
1421     vi = aj + ai[i];
1422     nz = ai[i + 1] - ai[i];
1423     s  = t + bs * i;
1424     PetscCall(PetscArraycpy(s, b + bs * i, bs)); /* copy i_th block of b to t */
1425     for (k = 0; k < nz; k++) {
1426       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[k]);
1427       v += bs2;
1428     }
1429   }
1430 
1431   /* backward solve the upper triangular */
1432   ls = a->solve_work + A->cmap->n;
1433   for (i = n - 1; i >= 0; i--) {
1434     v  = aa + bs2 * (adiag[i + 1] + 1);
1435     vi = aj + adiag[i + 1] + 1;
1436     nz = adiag[i] - adiag[i + 1] - 1;
1437     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1438     for (k = 0; k < nz; k++) {
1439       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[k]);
1440       v += bs2;
1441     }
1442     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * adiag[i], t + i * bs); /* *inv(diagonal[i]) */
1443     PetscCall(PetscArraycpy(x + i * bs, t + i * bs, bs));
1444   }
1445 
1446   PetscCall(VecRestoreArrayRead(bb, &b));
1447   PetscCall(VecRestoreArray(xx, &x));
1448   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1449   PetscFunctionReturn(PETSC_SUCCESS);
1450 }
1451 
1452 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A, Vec bb, Vec xx)
1453 {
1454   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
1455   IS                 iscol = a->col, isrow = a->row;
1456   const PetscInt    *r, *c, *rout, *cout, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi;
1457   PetscInt           i, m, n = a->mbs;
1458   PetscInt           nz, bs = A->rmap->bs, bs2 = a->bs2;
1459   const MatScalar   *aa = a->a, *v;
1460   PetscScalar       *x, *s, *t, *ls;
1461   const PetscScalar *b;
1462 
1463   PetscFunctionBegin;
1464   PetscCall(VecGetArrayRead(bb, &b));
1465   PetscCall(VecGetArray(xx, &x));
1466   t = a->solve_work;
1467 
1468   PetscCall(ISGetIndices(isrow, &rout));
1469   r = rout;
1470   PetscCall(ISGetIndices(iscol, &cout));
1471   c = cout;
1472 
1473   /* forward solve the lower triangular */
1474   PetscCall(PetscArraycpy(t, b + bs * r[0], bs));
1475   for (i = 1; i < n; i++) {
1476     v  = aa + bs2 * ai[i];
1477     vi = aj + ai[i];
1478     nz = ai[i + 1] - ai[i];
1479     s  = t + bs * i;
1480     PetscCall(PetscArraycpy(s, b + bs * r[i], bs));
1481     for (m = 0; m < nz; m++) {
1482       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * vi[m]);
1483       v += bs2;
1484     }
1485   }
1486 
1487   /* backward solve the upper triangular */
1488   ls = a->solve_work + A->cmap->n;
1489   for (i = n - 1; i >= 0; i--) {
1490     v  = aa + bs2 * (adiag[i + 1] + 1);
1491     vi = aj + adiag[i + 1] + 1;
1492     nz = adiag[i] - adiag[i + 1] - 1;
1493     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
1494     for (m = 0; m < nz; m++) {
1495       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * vi[m]);
1496       v += bs2;
1497     }
1498     PetscKernel_w_gets_A_times_v(bs, ls, v, t + i * bs); /* *inv(diagonal[i]) */
1499     PetscCall(PetscArraycpy(x + bs * c[i], t + i * bs, bs));
1500   }
1501   PetscCall(ISRestoreIndices(isrow, &rout));
1502   PetscCall(ISRestoreIndices(iscol, &cout));
1503   PetscCall(VecRestoreArrayRead(bb, &b));
1504   PetscCall(VecRestoreArray(xx, &x));
1505   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
1506   PetscFunctionReturn(PETSC_SUCCESS);
1507 }
1508 
1509 /*
1510     For each block in an block array saves the largest absolute value in the block into another array
1511 */
1512 static PetscErrorCode MatBlockAbs_private(PetscInt nbs, PetscInt bs2, PetscScalar *blockarray, PetscReal *absarray)
1513 {
1514   PetscInt i, j;
1515 
1516   PetscFunctionBegin;
1517   PetscCall(PetscArrayzero(absarray, nbs + 1));
1518   for (i = 0; i < nbs; i++) {
1519     for (j = 0; j < bs2; j++) {
1520       if (absarray[i] < PetscAbsScalar(blockarray[i * nbs + j])) absarray[i] = PetscAbsScalar(blockarray[i * nbs + j]);
1521     }
1522   }
1523   PetscFunctionReturn(PETSC_SUCCESS);
1524 }
1525 
1526 /*
1527      This needs to be renamed and called by the regular MatILUFactor_SeqBAIJ when drop tolerance is used
1528 */
1529 PetscErrorCode MatILUDTFactor_SeqBAIJ(Mat A, IS isrow, IS iscol, const MatFactorInfo *info, Mat *fact)
1530 {
1531   Mat             B = *fact;
1532   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data, *b;
1533   IS              isicol;
1534   const PetscInt *r, *ic;
1535   PetscInt        i, mbs = a->mbs, bs = A->rmap->bs, bs2 = a->bs2, *ai = a->i, *aj = a->j, *ajtmp, *adiag;
1536   PetscInt       *bi, *bj, *bdiag;
1537 
1538   PetscInt   row, nzi, nzi_bl, nzi_bu, *im, dtcount, nzi_al, nzi_au;
1539   PetscInt   nlnk, *lnk;
1540   PetscBT    lnkbt;
1541   PetscBool  row_identity, icol_identity;
1542   MatScalar *aatmp, *pv, *batmp, *ba, *rtmp, *pc, *multiplier, *vtmp;
1543   PetscInt   j, nz, *pj, *bjtmp, k, ncut, *jtmp;
1544 
1545   PetscReal  dt = info->dt; /* shift=info->shiftamount; */
1546   PetscInt   nnz_max;
1547   PetscBool  missing;
1548   PetscReal *vtmp_abs;
1549   MatScalar *v_work;
1550   PetscInt  *v_pivots;
1551   PetscBool  allowzeropivot, zeropivotdetected = PETSC_FALSE;
1552 
1553   PetscFunctionBegin;
1554   /* ------- symbolic factorization, can be reused ---------*/
1555   PetscCall(MatMissingDiagonal(A, &missing, &i));
1556   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, i);
1557   adiag = a->diag;
1558 
1559   PetscCall(ISInvertPermutation(iscol, PETSC_DECIDE, &isicol));
1560 
1561   /* bdiag is location of diagonal in factor */
1562   PetscCall(PetscMalloc1(mbs + 1, &bdiag));
1563 
1564   /* allocate row pointers bi */
1565   PetscCall(PetscMalloc1(2 * mbs + 2, &bi));
1566 
1567   /* allocate bj and ba; max num of nonzero entries is (ai[n]+2*n*dtcount+2) */
1568   dtcount = (PetscInt)info->dtcount;
1569   if (dtcount > mbs - 1) dtcount = mbs - 1;
1570   nnz_max = ai[mbs] + 2 * mbs * dtcount + 2;
1571   /* printf("MatILUDTFactor_SeqBAIJ, bs %d, ai[mbs] %d, nnz_max  %d, dtcount %d\n",bs,ai[mbs],nnz_max,dtcount); */
1572   PetscCall(PetscMalloc1(nnz_max, &bj));
1573   nnz_max = nnz_max * bs2;
1574   PetscCall(PetscMalloc1(nnz_max, &ba));
1575 
1576   /* put together the new matrix */
1577   PetscCall(MatSeqBAIJSetPreallocation(B, bs, MAT_SKIP_ALLOCATION, NULL));
1578 
1579   b               = (Mat_SeqBAIJ *)(B)->data;
1580   b->free_a       = PETSC_TRUE;
1581   b->free_ij      = PETSC_TRUE;
1582   b->singlemalloc = PETSC_FALSE;
1583 
1584   b->a    = ba;
1585   b->j    = bj;
1586   b->i    = bi;
1587   b->diag = bdiag;
1588   b->ilen = NULL;
1589   b->imax = NULL;
1590   b->row  = isrow;
1591   b->col  = iscol;
1592 
1593   PetscCall(PetscObjectReference((PetscObject)isrow));
1594   PetscCall(PetscObjectReference((PetscObject)iscol));
1595 
1596   b->icol = isicol;
1597   PetscCall(PetscMalloc1(bs * (mbs + 1), &b->solve_work));
1598   b->maxnz = nnz_max / bs2;
1599 
1600   (B)->factortype            = MAT_FACTOR_ILUDT;
1601   (B)->info.factor_mallocs   = 0;
1602   (B)->info.fill_ratio_given = ((PetscReal)nnz_max) / ((PetscReal)(ai[mbs] * bs2));
1603   /* ------- end of symbolic factorization ---------*/
1604   PetscCall(ISGetIndices(isrow, &r));
1605   PetscCall(ISGetIndices(isicol, &ic));
1606 
1607   /* linked list for storing column indices of the active row */
1608   nlnk = mbs + 1;
1609   PetscCall(PetscLLCreate(mbs, mbs, nlnk, lnk, lnkbt));
1610 
1611   /* im: used by PetscLLAddSortedLU(); jtmp: working array for column indices of active row */
1612   PetscCall(PetscMalloc2(mbs, &im, mbs, &jtmp));
1613   /* rtmp, vtmp: working arrays for sparse and contiguous row entries of active row */
1614   PetscCall(PetscMalloc2(mbs * bs2, &rtmp, mbs * bs2, &vtmp));
1615   PetscCall(PetscMalloc1(mbs + 1, &vtmp_abs));
1616   PetscCall(PetscMalloc3(bs, &v_work, bs2, &multiplier, bs, &v_pivots));
1617 
1618   allowzeropivot  = PetscNot(A->erroriffailure);
1619   bi[0]           = 0;
1620   bdiag[0]        = (nnz_max / bs2) - 1; /* location of diagonal in factor B */
1621   bi[2 * mbs + 1] = bdiag[0] + 1;        /* endof bj and ba array */
1622   for (i = 0; i < mbs; i++) {
1623     /* copy initial fill into linked list */
1624     nzi = ai[r[i] + 1] - ai[r[i]];
1625     PetscCheck(nzi, PETSC_COMM_SELF, PETSC_ERR_MAT_LU_ZRPVT, "Empty row in matrix: row in original ordering %" PetscInt_FMT " in permuted ordering %" PetscInt_FMT, r[i], i);
1626     nzi_al = adiag[r[i]] - ai[r[i]];
1627     nzi_au = ai[r[i] + 1] - adiag[r[i]] - 1;
1628 
1629     /* load in initial unfactored row */
1630     ajtmp = aj + ai[r[i]];
1631     PetscCall(PetscLLAddPerm(nzi, ajtmp, ic, mbs, &nlnk, lnk, lnkbt));
1632     PetscCall(PetscArrayzero(rtmp, mbs * bs2));
1633     aatmp = a->a + bs2 * ai[r[i]];
1634     for (j = 0; j < nzi; j++) PetscCall(PetscArraycpy(rtmp + bs2 * ic[ajtmp[j]], aatmp + bs2 * j, bs2));
1635 
1636     /* add pivot rows into linked list */
1637     row = lnk[mbs];
1638     while (row < i) {
1639       nzi_bl = bi[row + 1] - bi[row] + 1;
1640       bjtmp  = bj + bdiag[row + 1] + 1; /* points to 1st column next to the diagonal in U */
1641       PetscCall(PetscLLAddSortedLU(bjtmp, row, &nlnk, lnk, lnkbt, i, nzi_bl, im));
1642       nzi += nlnk;
1643       row = lnk[row];
1644     }
1645 
1646     /* copy data from lnk into jtmp, then initialize lnk */
1647     PetscCall(PetscLLClean(mbs, mbs, nzi, lnk, jtmp, lnkbt));
1648 
1649     /* numerical factorization */
1650     bjtmp = jtmp;
1651     row   = *bjtmp++; /* 1st pivot row */
1652 
1653     while (row < i) {
1654       pc = rtmp + bs2 * row;
1655       pv = ba + bs2 * bdiag[row];                           /* inv(diag) of the pivot row */
1656       PetscKernel_A_gets_A_times_B(bs, pc, pv, multiplier); /* pc= multiplier = pc*inv(diag[row]) */
1657       PetscCall(MatBlockAbs_private(1, bs2, pc, vtmp_abs));
1658       if (vtmp_abs[0] > dt) {         /* apply tolerance dropping rule */
1659         pj = bj + bdiag[row + 1] + 1; /* point to 1st entry of U(row,:) */
1660         pv = ba + bs2 * (bdiag[row + 1] + 1);
1661         nz = bdiag[row] - bdiag[row + 1] - 1; /* num of entries in U(row,:), excluding diagonal */
1662         for (j = 0; j < nz; j++) PetscKernel_A_gets_A_minus_B_times_C(bs, rtmp + bs2 * pj[j], pc, pv + bs2 * j);
1663         /* PetscCall(PetscLogFlops(bslog*(nz+1.0)-bs)); */
1664       }
1665       row = *bjtmp++;
1666     }
1667 
1668     /* copy sparse rtmp into contiguous vtmp; separate L and U part */
1669     nzi_bl = 0;
1670     j      = 0;
1671     while (jtmp[j] < i) { /* L-part. Note: jtmp is sorted */
1672       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1673       nzi_bl++;
1674       j++;
1675     }
1676     nzi_bu = nzi - nzi_bl - 1;
1677 
1678     while (j < nzi) { /* U-part */
1679       PetscCall(PetscArraycpy(vtmp + bs2 * j, rtmp + bs2 * jtmp[j], bs2));
1680       j++;
1681     }
1682 
1683     PetscCall(MatBlockAbs_private(nzi, bs2, vtmp, vtmp_abs));
1684 
1685     bjtmp = bj + bi[i];
1686     batmp = ba + bs2 * bi[i];
1687     /* apply level dropping rule to L part */
1688     ncut = nzi_al + dtcount;
1689     if (ncut < nzi_bl) {
1690       PetscCall(PetscSortSplitReal(ncut, nzi_bl, vtmp_abs, jtmp));
1691       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp, vtmp));
1692     } else {
1693       ncut = nzi_bl;
1694     }
1695     for (j = 0; j < ncut; j++) {
1696       bjtmp[j] = jtmp[j];
1697       PetscCall(PetscArraycpy(batmp + bs2 * j, rtmp + bs2 * bjtmp[j], bs2));
1698     }
1699     bi[i + 1] = bi[i] + ncut;
1700     nzi       = ncut + 1;
1701 
1702     /* apply level dropping rule to U part */
1703     ncut = nzi_au + dtcount;
1704     if (ncut < nzi_bu) {
1705       PetscCall(PetscSortSplitReal(ncut, nzi_bu, vtmp_abs + nzi_bl + 1, jtmp + nzi_bl + 1));
1706       PetscCall(PetscSortIntWithScalarArray(ncut, jtmp + nzi_bl + 1, vtmp + nzi_bl + 1));
1707     } else {
1708       ncut = nzi_bu;
1709     }
1710     nzi += ncut;
1711 
1712     /* mark bdiagonal */
1713     bdiag[i + 1]    = bdiag[i] - (ncut + 1);
1714     bi[2 * mbs - i] = bi[2 * mbs - i + 1] - (ncut + 1);
1715 
1716     bjtmp = bj + bdiag[i];
1717     batmp = ba + bs2 * bdiag[i];
1718     PetscCall(PetscArraycpy(batmp, rtmp + bs2 * i, bs2));
1719     *bjtmp = i;
1720 
1721     bjtmp = bj + bdiag[i + 1] + 1;
1722     batmp = ba + (bdiag[i + 1] + 1) * bs2;
1723 
1724     for (k = 0; k < ncut; k++) {
1725       bjtmp[k] = jtmp[nzi_bl + 1 + k];
1726       PetscCall(PetscArraycpy(batmp + bs2 * k, rtmp + bs2 * bjtmp[k], bs2));
1727     }
1728 
1729     im[i] = nzi; /* used by PetscLLAddSortedLU() */
1730 
1731     /* invert diagonal block for simpler triangular solves - add shift??? */
1732     batmp = ba + bs2 * bdiag[i];
1733 
1734     PetscCall(PetscKernel_A_gets_inverse_A(bs, batmp, v_pivots, v_work, allowzeropivot, &zeropivotdetected));
1735     if (zeropivotdetected) B->factorerrortype = MAT_FACTOR_NUMERIC_ZEROPIVOT;
1736   } /* for (i=0; i<mbs; i++) */
1737   PetscCall(PetscFree3(v_work, multiplier, v_pivots));
1738 
1739   /* printf("end of L %d, beginning of U %d\n",bi[mbs],bdiag[mbs]); */
1740   PetscCheck(bi[mbs] < bdiag[mbs], PETSC_COMM_SELF, PETSC_ERR_ARG_SIZ, "end of L array %" PetscInt_FMT " cannot >= the beginning of U array %" PetscInt_FMT, bi[mbs], bdiag[mbs]);
1741 
1742   PetscCall(ISRestoreIndices(isrow, &r));
1743   PetscCall(ISRestoreIndices(isicol, &ic));
1744 
1745   PetscCall(PetscLLDestroy(lnk, lnkbt));
1746 
1747   PetscCall(PetscFree2(im, jtmp));
1748   PetscCall(PetscFree2(rtmp, vtmp));
1749 
1750   PetscCall(PetscLogFlops(bs2 * B->cmap->n));
1751   b->maxnz = b->nz = bi[mbs] + bdiag[0] - bdiag[mbs];
1752 
1753   PetscCall(ISIdentity(isrow, &row_identity));
1754   PetscCall(ISIdentity(isicol, &icol_identity));
1755   if (row_identity && icol_identity) {
1756     B->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
1757   } else {
1758     B->ops->solve = MatSolve_SeqBAIJ_N;
1759   }
1760 
1761   B->ops->solveadd          = NULL;
1762   B->ops->solvetranspose    = NULL;
1763   B->ops->solvetransposeadd = NULL;
1764   B->ops->matsolve          = NULL;
1765   B->assembled              = PETSC_TRUE;
1766   B->preallocated           = PETSC_TRUE;
1767   PetscFunctionReturn(PETSC_SUCCESS);
1768 }
1769