xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision d2522c19e8fa9bca20aaca277941d9a63e71db6a)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) {
11   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
12   IS                 isrow = a->row;
13   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
14   const PetscInt    *r;
15   PetscInt           nz, *vj, k, idx, k1;
16   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
17   const MatScalar   *aa = a->a, *v, *diag;
18   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
19   const PetscScalar *b;
20 
21   PetscFunctionBegin;
22   PetscCall(VecGetArrayRead(bb, &b));
23   PetscCall(VecGetArray(xx, &x));
24   t = a->solve_work;
25   PetscCall(ISGetIndices(isrow, &r));
26   PetscCall(PetscMalloc1(bs, &xk_tmp));
27 
28   /* solve U^T * D * y = b by forward substitution */
29   xk = t;
30   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
31     idx = bs * r[k];
32     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
33   }
34   for (k = 0; k < mbs; k++) {
35     v  = aa + bs2 * ai[k];
36     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
37     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
38     nz = ai[k + 1] - ai[k];
39     vj = aj + ai[k];
40     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
41     while (nz--) {
42       /* x(:) += U(k,:)^T*(Dk*xk) */
43       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
44       vj++;
45       xj = t + (*vj) * bs;
46       v += bs2;
47     }
48     /* xk = inv(Dk)*(Dk*xk) */
49     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
50     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
51   }
52 
53   /* solve U*x = y by back substitution */
54   for (k = mbs - 1; k >= 0; k--) {
55     v  = aa + bs2 * ai[k];
56     xk = t + k * bs; /* xk */
57     nz = ai[k + 1] - ai[k];
58     vj = aj + ai[k];
59     xj = t + (*vj) * bs;
60     while (nz--) {
61       /* xk += U(k,:)*x(:) */
62       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
63       vj++;
64       v += bs2;
65       xj = t + (*vj) * bs;
66     }
67     idx = bs * r[k];
68     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
69   }
70 
71   PetscCall(PetscFree(xk_tmp));
72   PetscCall(ISRestoreIndices(isrow, &r));
73   PetscCall(VecRestoreArrayRead(bb, &b));
74   PetscCall(VecRestoreArray(xx, &x));
75   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) {
80   PetscFunctionBegin;
81   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
82 }
83 
84 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx) {
85   PetscFunctionBegin;
86   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
87 }
88 
89 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) {
90   PetscInt         nz, k;
91   const PetscInt  *vj, bs2 = bs * bs;
92   const MatScalar *v, *diag;
93   PetscScalar     *xk, *xj, *xk_tmp;
94 
95   PetscFunctionBegin;
96   PetscCall(PetscMalloc1(bs, &xk_tmp));
97   for (k = 0; k < mbs; k++) {
98     v  = aa + bs2 * ai[k];
99     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
100     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
101     nz = ai[k + 1] - ai[k];
102     vj = aj + ai[k];
103     xj = x + (*vj) * bs; /* *vj-th block of x, *vj>k */
104     while (nz--) {
105       /* x(:) += U(k,:)^T*(Dk*xk) */
106       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
107       vj++;
108       xj = x + (*vj) * bs;
109       v += bs2;
110     }
111     /* xk = inv(Dk)*(Dk*xk) */
112     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
113     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
114   }
115   PetscCall(PetscFree(xk_tmp));
116   PetscFunctionReturn(0);
117 }
118 
119 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x) {
120   PetscInt         nz, k;
121   const PetscInt  *vj, bs2 = bs * bs;
122   const MatScalar *v;
123   PetscScalar     *xk, *xj;
124 
125   PetscFunctionBegin;
126   for (k = mbs - 1; k >= 0; k--) {
127     v  = aa + bs2 * ai[k];
128     xk = x + k * bs; /* xk */
129     nz = ai[k + 1] - ai[k];
130     vj = aj + ai[k];
131     xj = x + (*vj) * bs;
132     while (nz--) {
133       /* xk += U(k,:)*x(:) */
134       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
135       vj++;
136       v += bs2;
137       xj = x + (*vj) * bs;
138     }
139   }
140   PetscFunctionReturn(0);
141 }
142 
143 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
144   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
145   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
146   PetscInt           bs = A->rmap->bs;
147   const MatScalar   *aa = a->a;
148   PetscScalar       *x;
149   const PetscScalar *b;
150 
151   PetscFunctionBegin;
152   PetscCall(VecGetArrayRead(bb, &b));
153   PetscCall(VecGetArray(xx, &x));
154 
155   /* solve U^T * D * y = b by forward substitution */
156   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
157   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
158 
159   /* solve U*x = y by back substitution */
160   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
161 
162   PetscCall(VecRestoreArrayRead(bb, &b));
163   PetscCall(VecRestoreArray(xx, &x));
164   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
165   PetscFunctionReturn(0);
166 }
167 
168 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
169   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
170   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
171   PetscInt           bs = A->rmap->bs;
172   const MatScalar   *aa = a->a;
173   const PetscScalar *b;
174   PetscScalar       *x;
175 
176   PetscFunctionBegin;
177   PetscCall(VecGetArrayRead(bb, &b));
178   PetscCall(VecGetArray(xx, &x));
179   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
180   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
181   PetscCall(VecRestoreArrayRead(bb, &b));
182   PetscCall(VecRestoreArray(xx, &x));
183   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
184   PetscFunctionReturn(0);
185 }
186 
187 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
188   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
189   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
190   PetscInt           bs = A->rmap->bs;
191   const MatScalar   *aa = a->a;
192   const PetscScalar *b;
193   PetscScalar       *x;
194 
195   PetscFunctionBegin;
196   PetscCall(VecGetArrayRead(bb, &b));
197   PetscCall(VecGetArray(xx, &x));
198   PetscCall(PetscArraycpy(x, b, bs * mbs));
199   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
200   PetscCall(VecRestoreArrayRead(bb, &b));
201   PetscCall(VecRestoreArray(xx, &x));
202   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
203   PetscFunctionReturn(0);
204 }
205 
206 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx) {
207   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
208   IS                 isrow = a->row;
209   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
210   PetscInt           nz, k, idx;
211   const MatScalar   *aa = a->a, *v, *d;
212   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
213   const PetscScalar *b;
214 
215   PetscFunctionBegin;
216   PetscCall(VecGetArrayRead(bb, &b));
217   PetscCall(VecGetArray(xx, &x));
218   t = a->solve_work;
219   PetscCall(ISGetIndices(isrow, &r));
220 
221   /* solve U^T * D * y = b by forward substitution */
222   tp = t;
223   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
224     idx   = 7 * r[k];
225     tp[0] = b[idx];
226     tp[1] = b[idx + 1];
227     tp[2] = b[idx + 2];
228     tp[3] = b[idx + 3];
229     tp[4] = b[idx + 4];
230     tp[5] = b[idx + 5];
231     tp[6] = b[idx + 6];
232     tp += 7;
233   }
234 
235   for (k = 0; k < mbs; k++) {
236     v  = aa + 49 * ai[k];
237     vj = aj + ai[k];
238     tp = t + k * 7;
239     x0 = tp[0];
240     x1 = tp[1];
241     x2 = tp[2];
242     x3 = tp[3];
243     x4 = tp[4];
244     x5 = tp[5];
245     x6 = tp[6];
246     nz = ai[k + 1] - ai[k];
247     tp = t + (*vj) * 7;
248     while (nz--) {
249       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
250       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
251       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
252       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
253       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
254       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
255       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
256       vj++;
257       tp = t + (*vj) * 7;
258       v += 49;
259     }
260 
261     /* xk = inv(Dk)*(Dk*xk) */
262     d     = aa + k * 49; /* ptr to inv(Dk) */
263     tp    = t + k * 7;
264     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
265     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
266     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
267     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
268     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
269     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
270     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
271   }
272 
273   /* solve U*x = y by back substitution */
274   for (k = mbs - 1; k >= 0; k--) {
275     v  = aa + 49 * ai[k];
276     vj = aj + ai[k];
277     tp = t + k * 7;
278     x0 = tp[0];
279     x1 = tp[1];
280     x2 = tp[2];
281     x3 = tp[3];
282     x4 = tp[4];
283     x5 = tp[5];
284     x6 = tp[6]; /* xk */
285     nz = ai[k + 1] - ai[k];
286 
287     tp = t + (*vj) * 7;
288     while (nz--) {
289       /* xk += U(k,:)*x(:) */
290       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
291       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
292       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
293       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
294       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
295       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
296       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
297       vj++;
298       tp = t + (*vj) * 7;
299       v += 49;
300     }
301     tp         = t + k * 7;
302     tp[0]      = x0;
303     tp[1]      = x1;
304     tp[2]      = x2;
305     tp[3]      = x3;
306     tp[4]      = x4;
307     tp[5]      = x5;
308     tp[6]      = x6;
309     idx        = 7 * r[k];
310     x[idx]     = x0;
311     x[idx + 1] = x1;
312     x[idx + 2] = x2;
313     x[idx + 3] = x3;
314     x[idx + 4] = x4;
315     x[idx + 5] = x5;
316     x[idx + 6] = x6;
317   }
318 
319   PetscCall(ISRestoreIndices(isrow, &r));
320   PetscCall(VecRestoreArrayRead(bb, &b));
321   PetscCall(VecRestoreArray(xx, &x));
322   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
323   PetscFunctionReturn(0);
324 }
325 
326 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
327   const MatScalar *v, *d;
328   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
329   PetscInt         nz, k;
330   const PetscInt  *vj;
331 
332   PetscFunctionBegin;
333   for (k = 0; k < mbs; k++) {
334     v  = aa + 49 * ai[k];
335     xp = x + k * 7;
336     x0 = xp[0];
337     x1 = xp[1];
338     x2 = xp[2];
339     x3 = xp[3];
340     x4 = xp[4];
341     x5 = xp[5];
342     x6 = xp[6]; /* Dk*xk = k-th block of x */
343     nz = ai[k + 1] - ai[k];
344     vj = aj + ai[k];
345     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
346     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
347     while (nz--) {
348       xp = x + (*vj) * 7;
349       /* x(:) += U(k,:)^T*(Dk*xk) */
350       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
351       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
352       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
353       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
354       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
355       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
356       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
357       vj++;
358       v += 49;
359     }
360     /* xk = inv(Dk)*(Dk*xk) */
361     d     = aa + k * 49; /* ptr to inv(Dk) */
362     xp    = x + k * 7;
363     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
364     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
365     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
366     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
367     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
368     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
369     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
370   }
371   PetscFunctionReturn(0);
372 }
373 
374 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
375   const MatScalar *v;
376   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
377   PetscInt         nz, k;
378   const PetscInt  *vj;
379 
380   PetscFunctionBegin;
381   for (k = mbs - 1; k >= 0; k--) {
382     v  = aa + 49 * ai[k];
383     xp = x + k * 7;
384     x0 = xp[0];
385     x1 = xp[1];
386     x2 = xp[2];
387     x3 = xp[3];
388     x4 = xp[4];
389     x5 = xp[5];
390     x6 = xp[6]; /* xk */
391     nz = ai[k + 1] - ai[k];
392     vj = aj + ai[k];
393     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
394     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
395     while (nz--) {
396       xp = x + (*vj) * 7;
397       /* xk += U(k,:)*x(:) */
398       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
399       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
400       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
401       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
402       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
403       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
404       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
405       vj++;
406       v += 49;
407     }
408     xp    = x + k * 7;
409     xp[0] = x0;
410     xp[1] = x1;
411     xp[2] = x2;
412     xp[3] = x3;
413     xp[4] = x4;
414     xp[5] = x5;
415     xp[6] = x6;
416   }
417   PetscFunctionReturn(0);
418 }
419 
420 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
421   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
422   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
423   const MatScalar   *aa = a->a;
424   PetscScalar       *x;
425   const PetscScalar *b;
426 
427   PetscFunctionBegin;
428   PetscCall(VecGetArrayRead(bb, &b));
429   PetscCall(VecGetArray(xx, &x));
430 
431   /* solve U^T * D * y = b by forward substitution */
432   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
433   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
434 
435   /* solve U*x = y by back substitution */
436   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
437 
438   PetscCall(VecRestoreArrayRead(bb, &b));
439   PetscCall(VecRestoreArray(xx, &x));
440   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
441   PetscFunctionReturn(0);
442 }
443 
444 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
445   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
446   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
447   const MatScalar   *aa = a->a;
448   PetscScalar       *x;
449   const PetscScalar *b;
450 
451   PetscFunctionBegin;
452   PetscCall(VecGetArrayRead(bb, &b));
453   PetscCall(VecGetArray(xx, &x));
454   PetscCall(PetscArraycpy(x, b, 7 * mbs));
455   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
456   PetscCall(VecRestoreArrayRead(bb, &b));
457   PetscCall(VecRestoreArray(xx, &x));
458   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
459   PetscFunctionReturn(0);
460 }
461 
462 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
463   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
464   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
465   const MatScalar   *aa = a->a;
466   PetscScalar       *x;
467   const PetscScalar *b;
468 
469   PetscFunctionBegin;
470   PetscCall(VecGetArrayRead(bb, &b));
471   PetscCall(VecGetArray(xx, &x));
472   PetscCall(PetscArraycpy(x, b, 7 * mbs));
473   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
474   PetscCall(VecRestoreArrayRead(bb, &b));
475   PetscCall(VecRestoreArray(xx, &x));
476   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
477   PetscFunctionReturn(0);
478 }
479 
480 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx) {
481   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
482   IS                 isrow = a->row;
483   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
484   PetscInt           nz, k, idx;
485   const MatScalar   *aa = a->a, *v, *d;
486   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
487   const PetscScalar *b;
488 
489   PetscFunctionBegin;
490   PetscCall(VecGetArrayRead(bb, &b));
491   PetscCall(VecGetArray(xx, &x));
492   t = a->solve_work;
493   PetscCall(ISGetIndices(isrow, &r));
494 
495   /* solve U^T * D * y = b by forward substitution */
496   tp = t;
497   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
498     idx   = 6 * r[k];
499     tp[0] = b[idx];
500     tp[1] = b[idx + 1];
501     tp[2] = b[idx + 2];
502     tp[3] = b[idx + 3];
503     tp[4] = b[idx + 4];
504     tp[5] = b[idx + 5];
505     tp += 6;
506   }
507 
508   for (k = 0; k < mbs; k++) {
509     v  = aa + 36 * ai[k];
510     vj = aj + ai[k];
511     tp = t + k * 6;
512     x0 = tp[0];
513     x1 = tp[1];
514     x2 = tp[2];
515     x3 = tp[3];
516     x4 = tp[4];
517     x5 = tp[5];
518     nz = ai[k + 1] - ai[k];
519     tp = t + (*vj) * 6;
520     while (nz--) {
521       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
522       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
523       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
524       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
525       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
526       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
527       vj++;
528       tp = t + (*vj) * 6;
529       v += 36;
530     }
531 
532     /* xk = inv(Dk)*(Dk*xk) */
533     d     = aa + k * 36; /* ptr to inv(Dk) */
534     tp    = t + k * 6;
535     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
536     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
537     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
538     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
539     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
540     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
541   }
542 
543   /* solve U*x = y by back substitution */
544   for (k = mbs - 1; k >= 0; k--) {
545     v  = aa + 36 * ai[k];
546     vj = aj + ai[k];
547     tp = t + k * 6;
548     x0 = tp[0];
549     x1 = tp[1];
550     x2 = tp[2];
551     x3 = tp[3];
552     x4 = tp[4];
553     x5 = tp[5]; /* xk */
554     nz = ai[k + 1] - ai[k];
555 
556     tp = t + (*vj) * 6;
557     while (nz--) {
558       /* xk += U(k,:)*x(:) */
559       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
560       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
561       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
562       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
563       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
564       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
565       vj++;
566       tp = t + (*vj) * 6;
567       v += 36;
568     }
569     tp         = t + k * 6;
570     tp[0]      = x0;
571     tp[1]      = x1;
572     tp[2]      = x2;
573     tp[3]      = x3;
574     tp[4]      = x4;
575     tp[5]      = x5;
576     idx        = 6 * r[k];
577     x[idx]     = x0;
578     x[idx + 1] = x1;
579     x[idx + 2] = x2;
580     x[idx + 3] = x3;
581     x[idx + 4] = x4;
582     x[idx + 5] = x5;
583   }
584 
585   PetscCall(ISRestoreIndices(isrow, &r));
586   PetscCall(VecRestoreArrayRead(bb, &b));
587   PetscCall(VecRestoreArray(xx, &x));
588   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
589   PetscFunctionReturn(0);
590 }
591 
592 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
593   const MatScalar *v, *d;
594   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
595   PetscInt         nz, k;
596   const PetscInt  *vj;
597 
598   PetscFunctionBegin;
599   for (k = 0; k < mbs; k++) {
600     v  = aa + 36 * ai[k];
601     xp = x + k * 6;
602     x0 = xp[0];
603     x1 = xp[1];
604     x2 = xp[2];
605     x3 = xp[3];
606     x4 = xp[4];
607     x5 = xp[5]; /* Dk*xk = k-th block of x */
608     nz = ai[k + 1] - ai[k];
609     vj = aj + ai[k];
610     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
611     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
612     while (nz--) {
613       xp = x + (*vj) * 6;
614       /* x(:) += U(k,:)^T*(Dk*xk) */
615       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
616       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
617       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
618       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
619       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
620       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
621       vj++;
622       v += 36;
623     }
624     /* xk = inv(Dk)*(Dk*xk) */
625     d     = aa + k * 36; /* ptr to inv(Dk) */
626     xp    = x + k * 6;
627     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
628     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
629     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
630     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
631     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
632     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
633   }
634   PetscFunctionReturn(0);
635 }
636 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
637   const MatScalar *v;
638   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
639   PetscInt         nz, k;
640   const PetscInt  *vj;
641 
642   PetscFunctionBegin;
643   for (k = mbs - 1; k >= 0; k--) {
644     v  = aa + 36 * ai[k];
645     xp = x + k * 6;
646     x0 = xp[0];
647     x1 = xp[1];
648     x2 = xp[2];
649     x3 = xp[3];
650     x4 = xp[4];
651     x5 = xp[5]; /* xk */
652     nz = ai[k + 1] - ai[k];
653     vj = aj + ai[k];
654     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
655     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
656     while (nz--) {
657       xp = x + (*vj) * 6;
658       /* xk += U(k,:)*x(:) */
659       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
660       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
661       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
662       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
663       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
664       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
665       vj++;
666       v += 36;
667     }
668     xp    = x + k * 6;
669     xp[0] = x0;
670     xp[1] = x1;
671     xp[2] = x2;
672     xp[3] = x3;
673     xp[4] = x4;
674     xp[5] = x5;
675   }
676   PetscFunctionReturn(0);
677 }
678 
679 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
680   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
681   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
682   const MatScalar   *aa = a->a;
683   PetscScalar       *x;
684   const PetscScalar *b;
685 
686   PetscFunctionBegin;
687   PetscCall(VecGetArrayRead(bb, &b));
688   PetscCall(VecGetArray(xx, &x));
689 
690   /* solve U^T * D * y = b by forward substitution */
691   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
692   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
693 
694   /* solve U*x = y by back substitution */
695   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
696 
697   PetscCall(VecRestoreArrayRead(bb, &b));
698   PetscCall(VecRestoreArray(xx, &x));
699   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
700   PetscFunctionReturn(0);
701 }
702 
703 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
704   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
705   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
706   const MatScalar   *aa = a->a;
707   PetscScalar       *x;
708   const PetscScalar *b;
709 
710   PetscFunctionBegin;
711   PetscCall(VecGetArrayRead(bb, &b));
712   PetscCall(VecGetArray(xx, &x));
713   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
714   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
715   PetscCall(VecRestoreArrayRead(bb, &b));
716   PetscCall(VecRestoreArray(xx, &x));
717   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
718   PetscFunctionReturn(0);
719 }
720 
721 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
722   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
723   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
724   const MatScalar   *aa = a->a;
725   PetscScalar       *x;
726   const PetscScalar *b;
727 
728   PetscFunctionBegin;
729   PetscCall(VecGetArrayRead(bb, &b));
730   PetscCall(VecGetArray(xx, &x));
731   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
732   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
733   PetscCall(VecRestoreArrayRead(bb, &b));
734   PetscCall(VecRestoreArray(xx, &x));
735   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
736   PetscFunctionReturn(0);
737 }
738 
739 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx) {
740   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
741   IS                 isrow = a->row;
742   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
743   const PetscInt    *r, *vj;
744   PetscInt           nz, k, idx;
745   const MatScalar   *aa = a->a, *v, *diag;
746   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
747   const PetscScalar *b;
748 
749   PetscFunctionBegin;
750   PetscCall(VecGetArrayRead(bb, &b));
751   PetscCall(VecGetArray(xx, &x));
752   t = a->solve_work;
753   PetscCall(ISGetIndices(isrow, &r));
754 
755   /* solve U^T * D * y = b by forward substitution */
756   tp = t;
757   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
758     idx   = 5 * r[k];
759     tp[0] = b[idx];
760     tp[1] = b[idx + 1];
761     tp[2] = b[idx + 2];
762     tp[3] = b[idx + 3];
763     tp[4] = b[idx + 4];
764     tp += 5;
765   }
766 
767   for (k = 0; k < mbs; k++) {
768     v  = aa + 25 * ai[k];
769     vj = aj + ai[k];
770     tp = t + k * 5;
771     x0 = tp[0];
772     x1 = tp[1];
773     x2 = tp[2];
774     x3 = tp[3];
775     x4 = tp[4];
776     nz = ai[k + 1] - ai[k];
777 
778     tp = t + (*vj) * 5;
779     while (nz--) {
780       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
781       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
782       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
783       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
784       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
785       vj++;
786       tp = t + (*vj) * 5;
787       v += 25;
788     }
789 
790     /* xk = inv(Dk)*(Dk*xk) */
791     diag  = aa + k * 25; /* ptr to inv(Dk) */
792     tp    = t + k * 5;
793     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
794     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
795     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
796     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
797     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
798   }
799 
800   /* solve U*x = y by back substitution */
801   for (k = mbs - 1; k >= 0; k--) {
802     v  = aa + 25 * ai[k];
803     vj = aj + ai[k];
804     tp = t + k * 5;
805     x0 = tp[0];
806     x1 = tp[1];
807     x2 = tp[2];
808     x3 = tp[3];
809     x4 = tp[4]; /* xk */
810     nz = ai[k + 1] - ai[k];
811 
812     tp = t + (*vj) * 5;
813     while (nz--) {
814       /* xk += U(k,:)*x(:) */
815       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
816       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
817       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
818       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
819       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
820       vj++;
821       tp = t + (*vj) * 5;
822       v += 25;
823     }
824     tp         = t + k * 5;
825     tp[0]      = x0;
826     tp[1]      = x1;
827     tp[2]      = x2;
828     tp[3]      = x3;
829     tp[4]      = x4;
830     idx        = 5 * r[k];
831     x[idx]     = x0;
832     x[idx + 1] = x1;
833     x[idx + 2] = x2;
834     x[idx + 3] = x3;
835     x[idx + 4] = x4;
836   }
837 
838   PetscCall(ISRestoreIndices(isrow, &r));
839   PetscCall(VecRestoreArrayRead(bb, &b));
840   PetscCall(VecRestoreArray(xx, &x));
841   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
842   PetscFunctionReturn(0);
843 }
844 
845 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
846   const MatScalar *v, *diag;
847   PetscScalar     *xp, x0, x1, x2, x3, x4;
848   PetscInt         nz, k;
849   const PetscInt  *vj;
850 
851   PetscFunctionBegin;
852   for (k = 0; k < mbs; k++) {
853     v  = aa + 25 * ai[k];
854     xp = x + k * 5;
855     x0 = xp[0];
856     x1 = xp[1];
857     x2 = xp[2];
858     x3 = xp[3];
859     x4 = xp[4]; /* Dk*xk = k-th block of x */
860     nz = ai[k + 1] - ai[k];
861     vj = aj + ai[k];
862     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
863     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
864     while (nz--) {
865       xp = x + (*vj) * 5;
866       /* x(:) += U(k,:)^T*(Dk*xk) */
867       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
868       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
869       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
870       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
871       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
872       vj++;
873       v += 25;
874     }
875     /* xk = inv(Dk)*(Dk*xk) */
876     diag  = aa + k * 25; /* ptr to inv(Dk) */
877     xp    = x + k * 5;
878     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
879     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
880     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
881     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
882     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
883   }
884   PetscFunctionReturn(0);
885 }
886 
887 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
888   const MatScalar *v;
889   PetscScalar     *xp, x0, x1, x2, x3, x4;
890   PetscInt         nz, k;
891   const PetscInt  *vj;
892 
893   PetscFunctionBegin;
894   for (k = mbs - 1; k >= 0; k--) {
895     v  = aa + 25 * ai[k];
896     xp = x + k * 5;
897     x0 = xp[0];
898     x1 = xp[1];
899     x2 = xp[2];
900     x3 = xp[3];
901     x4 = xp[4]; /* xk */
902     nz = ai[k + 1] - ai[k];
903     vj = aj + ai[k];
904     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
905     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
906     while (nz--) {
907       xp = x + (*vj) * 5;
908       /* xk += U(k,:)*x(:) */
909       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
910       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
911       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
912       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
913       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
914       vj++;
915       v += 25;
916     }
917     xp    = x + k * 5;
918     xp[0] = x0;
919     xp[1] = x1;
920     xp[2] = x2;
921     xp[3] = x3;
922     xp[4] = x4;
923   }
924   PetscFunctionReturn(0);
925 }
926 
927 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
928   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
929   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
930   const MatScalar   *aa = a->a;
931   PetscScalar       *x;
932   const PetscScalar *b;
933 
934   PetscFunctionBegin;
935   PetscCall(VecGetArrayRead(bb, &b));
936   PetscCall(VecGetArray(xx, &x));
937 
938   /* solve U^T * D * y = b by forward substitution */
939   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
940   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
941 
942   /* solve U*x = y by back substitution */
943   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
944 
945   PetscCall(VecRestoreArrayRead(bb, &b));
946   PetscCall(VecRestoreArray(xx, &x));
947   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
948   PetscFunctionReturn(0);
949 }
950 
951 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
952   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
953   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
954   const MatScalar   *aa = a->a;
955   PetscScalar       *x;
956   const PetscScalar *b;
957 
958   PetscFunctionBegin;
959   PetscCall(VecGetArrayRead(bb, &b));
960   PetscCall(VecGetArray(xx, &x));
961   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
962   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
963   PetscCall(VecRestoreArrayRead(bb, &b));
964   PetscCall(VecRestoreArray(xx, &x));
965   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
966   PetscFunctionReturn(0);
967 }
968 
969 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
970   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
971   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
972   const MatScalar   *aa = a->a;
973   PetscScalar       *x;
974   const PetscScalar *b;
975 
976   PetscFunctionBegin;
977   PetscCall(VecGetArrayRead(bb, &b));
978   PetscCall(VecGetArray(xx, &x));
979   PetscCall(PetscArraycpy(x, b, 5 * mbs));
980   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
981   PetscCall(VecRestoreArrayRead(bb, &b));
982   PetscCall(VecRestoreArray(xx, &x));
983   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
984   PetscFunctionReturn(0);
985 }
986 
987 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx) {
988   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
989   IS                 isrow = a->row;
990   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
991   const PetscInt    *r, *vj;
992   PetscInt           nz, k, idx;
993   const MatScalar   *aa = a->a, *v, *diag;
994   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
995   const PetscScalar *b;
996 
997   PetscFunctionBegin;
998   PetscCall(VecGetArrayRead(bb, &b));
999   PetscCall(VecGetArray(xx, &x));
1000   t = a->solve_work;
1001   PetscCall(ISGetIndices(isrow, &r));
1002 
1003   /* solve U^T * D * y = b by forward substitution */
1004   tp = t;
1005   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1006     idx   = 4 * r[k];
1007     tp[0] = b[idx];
1008     tp[1] = b[idx + 1];
1009     tp[2] = b[idx + 2];
1010     tp[3] = b[idx + 3];
1011     tp += 4;
1012   }
1013 
1014   for (k = 0; k < mbs; k++) {
1015     v  = aa + 16 * ai[k];
1016     vj = aj + ai[k];
1017     tp = t + k * 4;
1018     x0 = tp[0];
1019     x1 = tp[1];
1020     x2 = tp[2];
1021     x3 = tp[3];
1022     nz = ai[k + 1] - ai[k];
1023 
1024     tp = t + (*vj) * 4;
1025     while (nz--) {
1026       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1027       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1028       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1029       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1030       vj++;
1031       tp = t + (*vj) * 4;
1032       v += 16;
1033     }
1034 
1035     /* xk = inv(Dk)*(Dk*xk) */
1036     diag  = aa + k * 16; /* ptr to inv(Dk) */
1037     tp    = t + k * 4;
1038     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1039     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1040     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1041     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1042   }
1043 
1044   /* solve U*x = y by back substitution */
1045   for (k = mbs - 1; k >= 0; k--) {
1046     v  = aa + 16 * ai[k];
1047     vj = aj + ai[k];
1048     tp = t + k * 4;
1049     x0 = tp[0];
1050     x1 = tp[1];
1051     x2 = tp[2];
1052     x3 = tp[3]; /* xk */
1053     nz = ai[k + 1] - ai[k];
1054 
1055     tp = t + (*vj) * 4;
1056     while (nz--) {
1057       /* xk += U(k,:)*x(:) */
1058       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1059       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1060       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1061       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1062       vj++;
1063       tp = t + (*vj) * 4;
1064       v += 16;
1065     }
1066     tp         = t + k * 4;
1067     tp[0]      = x0;
1068     tp[1]      = x1;
1069     tp[2]      = x2;
1070     tp[3]      = x3;
1071     idx        = 4 * r[k];
1072     x[idx]     = x0;
1073     x[idx + 1] = x1;
1074     x[idx + 2] = x2;
1075     x[idx + 3] = x3;
1076   }
1077 
1078   PetscCall(ISRestoreIndices(isrow, &r));
1079   PetscCall(VecRestoreArrayRead(bb, &b));
1080   PetscCall(VecRestoreArray(xx, &x));
1081   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1082   PetscFunctionReturn(0);
1083 }
1084 
1085 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1086   const MatScalar *v, *diag;
1087   PetscScalar     *xp, x0, x1, x2, x3;
1088   PetscInt         nz, k;
1089   const PetscInt  *vj;
1090 
1091   PetscFunctionBegin;
1092   for (k = 0; k < mbs; k++) {
1093     v  = aa + 16 * ai[k];
1094     xp = x + k * 4;
1095     x0 = xp[0];
1096     x1 = xp[1];
1097     x2 = xp[2];
1098     x3 = xp[3]; /* Dk*xk = k-th block of x */
1099     nz = ai[k + 1] - ai[k];
1100     vj = aj + ai[k];
1101     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1102     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1103     while (nz--) {
1104       xp = x + (*vj) * 4;
1105       /* x(:) += U(k,:)^T*(Dk*xk) */
1106       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1107       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1108       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1109       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1110       vj++;
1111       v += 16;
1112     }
1113     /* xk = inv(Dk)*(Dk*xk) */
1114     diag  = aa + k * 16; /* ptr to inv(Dk) */
1115     xp    = x + k * 4;
1116     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1117     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1118     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1119     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1120   }
1121   PetscFunctionReturn(0);
1122 }
1123 
1124 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1125   const MatScalar *v;
1126   PetscScalar     *xp, x0, x1, x2, x3;
1127   PetscInt         nz, k;
1128   const PetscInt  *vj;
1129 
1130   PetscFunctionBegin;
1131   for (k = mbs - 1; k >= 0; k--) {
1132     v  = aa + 16 * ai[k];
1133     xp = x + k * 4;
1134     x0 = xp[0];
1135     x1 = xp[1];
1136     x2 = xp[2];
1137     x3 = xp[3]; /* xk */
1138     nz = ai[k + 1] - ai[k];
1139     vj = aj + ai[k];
1140     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1141     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1142     while (nz--) {
1143       xp = x + (*vj) * 4;
1144       /* xk += U(k,:)*x(:) */
1145       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1146       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1147       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1148       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1149       vj++;
1150       v += 16;
1151     }
1152     xp    = x + k * 4;
1153     xp[0] = x0;
1154     xp[1] = x1;
1155     xp[2] = x2;
1156     xp[3] = x3;
1157   }
1158   PetscFunctionReturn(0);
1159 }
1160 
1161 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1162   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1163   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1164   const MatScalar   *aa = a->a;
1165   PetscScalar       *x;
1166   const PetscScalar *b;
1167 
1168   PetscFunctionBegin;
1169   PetscCall(VecGetArrayRead(bb, &b));
1170   PetscCall(VecGetArray(xx, &x));
1171 
1172   /* solve U^T * D * y = b by forward substitution */
1173   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1174   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1175 
1176   /* solve U*x = y by back substitution */
1177   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1178   PetscCall(VecRestoreArrayRead(bb, &b));
1179   PetscCall(VecRestoreArray(xx, &x));
1180   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1181   PetscFunctionReturn(0);
1182 }
1183 
1184 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1185   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1186   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1187   const MatScalar   *aa = a->a;
1188   PetscScalar       *x;
1189   const PetscScalar *b;
1190 
1191   PetscFunctionBegin;
1192   PetscCall(VecGetArrayRead(bb, &b));
1193   PetscCall(VecGetArray(xx, &x));
1194   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1195   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1196   PetscCall(VecRestoreArrayRead(bb, &b));
1197   PetscCall(VecRestoreArray(xx, &x));
1198   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1199   PetscFunctionReturn(0);
1200 }
1201 
1202 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1203   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1204   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1205   const MatScalar   *aa = a->a;
1206   PetscScalar       *x;
1207   const PetscScalar *b;
1208 
1209   PetscFunctionBegin;
1210   PetscCall(VecGetArrayRead(bb, &b));
1211   PetscCall(VecGetArray(xx, &x));
1212   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1213   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1214   PetscCall(VecRestoreArrayRead(bb, &b));
1215   PetscCall(VecRestoreArray(xx, &x));
1216   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1217   PetscFunctionReturn(0);
1218 }
1219 
1220 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx) {
1221   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1222   IS                 isrow = a->row;
1223   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1224   const PetscInt    *r;
1225   PetscInt           nz, k, idx;
1226   const PetscInt    *vj;
1227   const MatScalar   *aa = a->a, *v, *diag;
1228   PetscScalar       *x, x0, x1, x2, *t, *tp;
1229   const PetscScalar *b;
1230 
1231   PetscFunctionBegin;
1232   PetscCall(VecGetArrayRead(bb, &b));
1233   PetscCall(VecGetArray(xx, &x));
1234   t = a->solve_work;
1235   PetscCall(ISGetIndices(isrow, &r));
1236 
1237   /* solve U^T * D * y = b by forward substitution */
1238   tp = t;
1239   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1240     idx   = 3 * r[k];
1241     tp[0] = b[idx];
1242     tp[1] = b[idx + 1];
1243     tp[2] = b[idx + 2];
1244     tp += 3;
1245   }
1246 
1247   for (k = 0; k < mbs; k++) {
1248     v  = aa + 9 * ai[k];
1249     vj = aj + ai[k];
1250     tp = t + k * 3;
1251     x0 = tp[0];
1252     x1 = tp[1];
1253     x2 = tp[2];
1254     nz = ai[k + 1] - ai[k];
1255 
1256     tp = t + (*vj) * 3;
1257     while (nz--) {
1258       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1259       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1260       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1261       vj++;
1262       tp = t + (*vj) * 3;
1263       v += 9;
1264     }
1265 
1266     /* xk = inv(Dk)*(Dk*xk) */
1267     diag  = aa + k * 9; /* ptr to inv(Dk) */
1268     tp    = t + k * 3;
1269     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1270     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1271     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1272   }
1273 
1274   /* solve U*x = y by back substitution */
1275   for (k = mbs - 1; k >= 0; k--) {
1276     v  = aa + 9 * ai[k];
1277     vj = aj + ai[k];
1278     tp = t + k * 3;
1279     x0 = tp[0];
1280     x1 = tp[1];
1281     x2 = tp[2]; /* xk */
1282     nz = ai[k + 1] - ai[k];
1283 
1284     tp = t + (*vj) * 3;
1285     while (nz--) {
1286       /* xk += U(k,:)*x(:) */
1287       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1288       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1289       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1290       vj++;
1291       tp = t + (*vj) * 3;
1292       v += 9;
1293     }
1294     tp         = t + k * 3;
1295     tp[0]      = x0;
1296     tp[1]      = x1;
1297     tp[2]      = x2;
1298     idx        = 3 * r[k];
1299     x[idx]     = x0;
1300     x[idx + 1] = x1;
1301     x[idx + 2] = x2;
1302   }
1303 
1304   PetscCall(ISRestoreIndices(isrow, &r));
1305   PetscCall(VecRestoreArrayRead(bb, &b));
1306   PetscCall(VecRestoreArray(xx, &x));
1307   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1308   PetscFunctionReturn(0);
1309 }
1310 
1311 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1312   const MatScalar *v, *diag;
1313   PetscScalar     *xp, x0, x1, x2;
1314   PetscInt         nz, k;
1315   const PetscInt  *vj;
1316 
1317   PetscFunctionBegin;
1318   for (k = 0; k < mbs; k++) {
1319     v  = aa + 9 * ai[k];
1320     xp = x + k * 3;
1321     x0 = xp[0];
1322     x1 = xp[1];
1323     x2 = xp[2]; /* Dk*xk = k-th block of x */
1324     nz = ai[k + 1] - ai[k];
1325     vj = aj + ai[k];
1326     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1327     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1328     while (nz--) {
1329       xp = x + (*vj) * 3;
1330       /* x(:) += U(k,:)^T*(Dk*xk) */
1331       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1332       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1333       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1334       vj++;
1335       v += 9;
1336     }
1337     /* xk = inv(Dk)*(Dk*xk) */
1338     diag  = aa + k * 9; /* ptr to inv(Dk) */
1339     xp    = x + k * 3;
1340     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1341     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1342     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1343   }
1344   PetscFunctionReturn(0);
1345 }
1346 
1347 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1348   const MatScalar *v;
1349   PetscScalar     *xp, x0, x1, x2;
1350   PetscInt         nz, k;
1351   const PetscInt  *vj;
1352 
1353   PetscFunctionBegin;
1354   for (k = mbs - 1; k >= 0; k--) {
1355     v  = aa + 9 * ai[k];
1356     xp = x + k * 3;
1357     x0 = xp[0];
1358     x1 = xp[1];
1359     x2 = xp[2]; /* xk */
1360     nz = ai[k + 1] - ai[k];
1361     vj = aj + ai[k];
1362     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1363     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1364     while (nz--) {
1365       xp = x + (*vj) * 3;
1366       /* xk += U(k,:)*x(:) */
1367       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1368       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1369       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1370       vj++;
1371       v += 9;
1372     }
1373     xp    = x + k * 3;
1374     xp[0] = x0;
1375     xp[1] = x1;
1376     xp[2] = x2;
1377   }
1378   PetscFunctionReturn(0);
1379 }
1380 
1381 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1382   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1383   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1384   const MatScalar   *aa = a->a;
1385   PetscScalar       *x;
1386   const PetscScalar *b;
1387 
1388   PetscFunctionBegin;
1389   PetscCall(VecGetArrayRead(bb, &b));
1390   PetscCall(VecGetArray(xx, &x));
1391 
1392   /* solve U^T * D * y = b by forward substitution */
1393   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1394   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1395 
1396   /* solve U*x = y by back substitution */
1397   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1398 
1399   PetscCall(VecRestoreArrayRead(bb, &b));
1400   PetscCall(VecRestoreArray(xx, &x));
1401   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1402   PetscFunctionReturn(0);
1403 }
1404 
1405 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1406   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1407   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1408   const MatScalar   *aa = a->a;
1409   PetscScalar       *x;
1410   const PetscScalar *b;
1411 
1412   PetscFunctionBegin;
1413   PetscCall(VecGetArrayRead(bb, &b));
1414   PetscCall(VecGetArray(xx, &x));
1415   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1416   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1417   PetscCall(VecRestoreArrayRead(bb, &b));
1418   PetscCall(VecRestoreArray(xx, &x));
1419   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1420   PetscFunctionReturn(0);
1421 }
1422 
1423 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1424   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1425   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1426   const MatScalar   *aa = a->a;
1427   PetscScalar       *x;
1428   const PetscScalar *b;
1429 
1430   PetscFunctionBegin;
1431   PetscCall(VecGetArrayRead(bb, &b));
1432   PetscCall(VecGetArray(xx, &x));
1433   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1434   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1435   PetscCall(VecRestoreArrayRead(bb, &b));
1436   PetscCall(VecRestoreArray(xx, &x));
1437   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1438   PetscFunctionReturn(0);
1439 }
1440 
1441 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx) {
1442   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1443   IS                 isrow = a->row;
1444   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1445   const PetscInt    *r, *vj;
1446   PetscInt           nz, k, k2, idx;
1447   const MatScalar   *aa = a->a, *v, *diag;
1448   PetscScalar       *x, x0, x1, *t;
1449   const PetscScalar *b;
1450 
1451   PetscFunctionBegin;
1452   PetscCall(VecGetArrayRead(bb, &b));
1453   PetscCall(VecGetArray(xx, &x));
1454   t = a->solve_work;
1455   PetscCall(ISGetIndices(isrow, &r));
1456 
1457   /* solve U^T * D * y = perm(b) by forward substitution */
1458   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1459     idx          = 2 * r[k];
1460     t[k * 2]     = b[idx];
1461     t[k * 2 + 1] = b[idx + 1];
1462   }
1463   for (k = 0; k < mbs; k++) {
1464     v  = aa + 4 * ai[k];
1465     vj = aj + ai[k];
1466     k2 = k * 2;
1467     x0 = t[k2];
1468     x1 = t[k2 + 1];
1469     nz = ai[k + 1] - ai[k];
1470     while (nz--) {
1471       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1472       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1473       vj++;
1474       v += 4;
1475     }
1476     diag      = aa + k * 4; /* ptr to inv(Dk) */
1477     t[k2]     = diag[0] * x0 + diag[2] * x1;
1478     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1479   }
1480 
1481   /* solve U*x = y by back substitution */
1482   for (k = mbs - 1; k >= 0; k--) {
1483     v  = aa + 4 * ai[k];
1484     vj = aj + ai[k];
1485     k2 = k * 2;
1486     x0 = t[k2];
1487     x1 = t[k2 + 1];
1488     nz = ai[k + 1] - ai[k];
1489     while (nz--) {
1490       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1491       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1492       vj++;
1493       v += 4;
1494     }
1495     t[k2]      = x0;
1496     t[k2 + 1]  = x1;
1497     idx        = 2 * r[k];
1498     x[idx]     = x0;
1499     x[idx + 1] = x1;
1500   }
1501 
1502   PetscCall(ISRestoreIndices(isrow, &r));
1503   PetscCall(VecRestoreArrayRead(bb, &b));
1504   PetscCall(VecRestoreArray(xx, &x));
1505   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1506   PetscFunctionReturn(0);
1507 }
1508 
1509 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1510   const MatScalar *v, *diag;
1511   PetscScalar      x0, x1;
1512   PetscInt         nz, k, k2;
1513   const PetscInt  *vj;
1514 
1515   PetscFunctionBegin;
1516   for (k = 0; k < mbs; k++) {
1517     v  = aa + 4 * ai[k];
1518     vj = aj + ai[k];
1519     k2 = k * 2;
1520     x0 = x[k2];
1521     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1522     nz = ai[k + 1] - ai[k];
1523     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1524     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1525     while (nz--) {
1526       /* x(:) += U(k,:)^T*(Dk*xk) */
1527       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1528       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1529       vj++;
1530       v += 4;
1531     }
1532     /* xk = inv(Dk)*(Dk*xk) */
1533     diag      = aa + k * 4; /* ptr to inv(Dk) */
1534     x[k2]     = diag[0] * x0 + diag[2] * x1;
1535     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1536   }
1537   PetscFunctionReturn(0);
1538 }
1539 
1540 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x) {
1541   const MatScalar *v;
1542   PetscScalar      x0, x1;
1543   PetscInt         nz, k, k2;
1544   const PetscInt  *vj;
1545 
1546   PetscFunctionBegin;
1547   for (k = mbs - 1; k >= 0; k--) {
1548     v  = aa + 4 * ai[k];
1549     vj = aj + ai[k];
1550     k2 = k * 2;
1551     x0 = x[k2];
1552     x1 = x[k2 + 1]; /* xk */
1553     nz = ai[k + 1] - ai[k];
1554     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1555     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1556     while (nz--) {
1557       /* xk += U(k,:)*x(:) */
1558       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1559       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1560       vj++;
1561       v += 4;
1562     }
1563     x[k2]     = x0;
1564     x[k2 + 1] = x1;
1565   }
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1570   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1571   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1572   const MatScalar   *aa = a->a;
1573   PetscScalar       *x;
1574   const PetscScalar *b;
1575 
1576   PetscFunctionBegin;
1577   PetscCall(VecGetArrayRead(bb, &b));
1578   PetscCall(VecGetArray(xx, &x));
1579 
1580   /* solve U^T * D * y = b by forward substitution */
1581   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1582   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1583 
1584   /* solve U*x = y by back substitution */
1585   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1586 
1587   PetscCall(VecRestoreArrayRead(bb, &b));
1588   PetscCall(VecRestoreArray(xx, &x));
1589   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1590   PetscFunctionReturn(0);
1591 }
1592 
1593 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1594   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1595   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1596   const MatScalar   *aa = a->a;
1597   PetscScalar       *x;
1598   const PetscScalar *b;
1599 
1600   PetscFunctionBegin;
1601   PetscCall(VecGetArrayRead(bb, &b));
1602   PetscCall(VecGetArray(xx, &x));
1603   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1604   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1605   PetscCall(VecRestoreArrayRead(bb, &b));
1606   PetscCall(VecRestoreArray(xx, &x));
1607   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1608   PetscFunctionReturn(0);
1609 }
1610 
1611 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
1612   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1613   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1614   const MatScalar   *aa = a->a;
1615   PetscScalar       *x;
1616   const PetscScalar *b;
1617 
1618   PetscFunctionBegin;
1619   PetscCall(VecGetArrayRead(bb, &b));
1620   PetscCall(VecGetArray(xx, &x));
1621   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1622   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1623   PetscCall(VecRestoreArrayRead(bb, &b));
1624   PetscCall(VecRestoreArray(xx, &x));
1625   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) {
1630   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1631   IS                 isrow = a->row;
1632   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1633   const MatScalar   *aa = a->a, *v;
1634   const PetscScalar *b;
1635   PetscScalar       *x, xk, *t;
1636   PetscInt           nz, k, j;
1637 
1638   PetscFunctionBegin;
1639   PetscCall(VecGetArrayRead(bb, &b));
1640   PetscCall(VecGetArray(xx, &x));
1641   t = a->solve_work;
1642   PetscCall(ISGetIndices(isrow, &rp));
1643 
1644   /* solve U^T*D*y = perm(b) by forward substitution */
1645   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1646   for (k = 0; k < mbs; k++) {
1647     v  = aa + ai[k];
1648     vj = aj + ai[k];
1649     xk = t[k];
1650     nz = ai[k + 1] - ai[k] - 1;
1651     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1652     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1653   }
1654 
1655   /* solve U*perm(x) = y by back substitution */
1656   for (k = mbs - 1; k >= 0; k--) {
1657     v  = aa + adiag[k] - 1;
1658     vj = aj + adiag[k] - 1;
1659     nz = ai[k + 1] - ai[k] - 1;
1660     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1661     x[rp[k]] = t[k];
1662   }
1663 
1664   PetscCall(ISRestoreIndices(isrow, &rp));
1665   PetscCall(VecRestoreArrayRead(bb, &b));
1666   PetscCall(VecRestoreArray(xx, &x));
1667   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1668   PetscFunctionReturn(0);
1669 }
1670 
1671 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) {
1672   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1673   IS                 isrow = a->row;
1674   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1675   const MatScalar   *aa = a->a, *v;
1676   PetscScalar       *x, xk, *t;
1677   const PetscScalar *b;
1678   PetscInt           nz, k;
1679 
1680   PetscFunctionBegin;
1681   PetscCall(VecGetArrayRead(bb, &b));
1682   PetscCall(VecGetArray(xx, &x));
1683   t = a->solve_work;
1684   PetscCall(ISGetIndices(isrow, &rp));
1685 
1686   /* solve U^T*D*y = perm(b) by forward substitution */
1687   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1688   for (k = 0; k < mbs; k++) {
1689     v  = aa + ai[k] + 1;
1690     vj = aj + ai[k] + 1;
1691     xk = t[k];
1692     nz = ai[k + 1] - ai[k] - 1;
1693     while (nz--) t[*vj++] += (*v++) * xk;
1694     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1695   }
1696 
1697   /* solve U*perm(x) = y by back substitution */
1698   for (k = mbs - 1; k >= 0; k--) {
1699     v  = aa + ai[k] + 1;
1700     vj = aj + ai[k] + 1;
1701     nz = ai[k + 1] - ai[k] - 1;
1702     while (nz--) t[k] += (*v++) * t[*vj++];
1703     x[rp[k]] = t[k];
1704   }
1705 
1706   PetscCall(ISRestoreIndices(isrow, &rp));
1707   PetscCall(VecRestoreArrayRead(bb, &b));
1708   PetscCall(VecRestoreArray(xx, &x));
1709   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1710   PetscFunctionReturn(0);
1711 }
1712 
1713 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) {
1714   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1715   IS                 isrow = a->row;
1716   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1717   const MatScalar   *aa = a->a, *v;
1718   PetscReal          diagk;
1719   PetscScalar       *x, xk;
1720   const PetscScalar *b;
1721   PetscInt           nz, k;
1722 
1723   PetscFunctionBegin;
1724   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1725   PetscCall(VecGetArrayRead(bb, &b));
1726   PetscCall(VecGetArray(xx, &x));
1727   PetscCall(ISGetIndices(isrow, &rp));
1728 
1729   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1730   for (k = 0; k < mbs; k++) {
1731     v  = aa + ai[k];
1732     vj = aj + ai[k];
1733     xk = x[k];
1734     nz = ai[k + 1] - ai[k] - 1;
1735     while (nz--) x[*vj++] += (*v++) * xk;
1736 
1737     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1738     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1739     x[k] = xk * PetscSqrtReal(diagk);
1740   }
1741   PetscCall(ISRestoreIndices(isrow, &rp));
1742   PetscCall(VecRestoreArrayRead(bb, &b));
1743   PetscCall(VecRestoreArray(xx, &x));
1744   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1745   PetscFunctionReturn(0);
1746 }
1747 
1748 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) {
1749   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1750   IS                 isrow = a->row;
1751   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1752   const MatScalar   *aa = a->a, *v;
1753   PetscReal          diagk;
1754   PetscScalar       *x, xk;
1755   const PetscScalar *b;
1756   PetscInt           nz, k;
1757 
1758   PetscFunctionBegin;
1759   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1760   PetscCall(VecGetArrayRead(bb, &b));
1761   PetscCall(VecGetArray(xx, &x));
1762   PetscCall(ISGetIndices(isrow, &rp));
1763 
1764   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1765   for (k = 0; k < mbs; k++) {
1766     v  = aa + ai[k] + 1;
1767     vj = aj + ai[k] + 1;
1768     xk = x[k];
1769     nz = ai[k + 1] - ai[k] - 1;
1770     while (nz--) x[*vj++] += (*v++) * xk;
1771 
1772     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1773     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1774     x[k] = xk * PetscSqrtReal(diagk);
1775   }
1776   PetscCall(ISRestoreIndices(isrow, &rp));
1777   PetscCall(VecRestoreArrayRead(bb, &b));
1778   PetscCall(VecRestoreArray(xx, &x));
1779   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1780   PetscFunctionReturn(0);
1781 }
1782 
1783 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx) {
1784   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1785   IS                 isrow = a->row;
1786   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1787   const MatScalar   *aa = a->a, *v;
1788   PetscReal          diagk;
1789   PetscScalar       *x, *t;
1790   const PetscScalar *b;
1791   PetscInt           nz, k;
1792 
1793   PetscFunctionBegin;
1794   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1795   PetscCall(VecGetArrayRead(bb, &b));
1796   PetscCall(VecGetArray(xx, &x));
1797   t = a->solve_work;
1798   PetscCall(ISGetIndices(isrow, &rp));
1799 
1800   for (k = mbs - 1; k >= 0; k--) {
1801     v     = aa + ai[k];
1802     vj    = aj + ai[k];
1803     diagk = PetscRealPart(aa[adiag[k]]);
1804     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1805     t[k] = b[k] * PetscSqrtReal(diagk);
1806     nz   = ai[k + 1] - ai[k] - 1;
1807     while (nz--) t[k] += (*v++) * t[*vj++];
1808     x[rp[k]] = t[k];
1809   }
1810   PetscCall(ISRestoreIndices(isrow, &rp));
1811   PetscCall(VecRestoreArrayRead(bb, &b));
1812   PetscCall(VecRestoreArray(xx, &x));
1813   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1814   PetscFunctionReturn(0);
1815 }
1816 
1817 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx) {
1818   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1819   IS                 isrow = a->row;
1820   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1821   const MatScalar   *aa = a->a, *v;
1822   PetscReal          diagk;
1823   PetscScalar       *x, *t;
1824   const PetscScalar *b;
1825   PetscInt           nz, k;
1826 
1827   PetscFunctionBegin;
1828   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1829   PetscCall(VecGetArrayRead(bb, &b));
1830   PetscCall(VecGetArray(xx, &x));
1831   t = a->solve_work;
1832   PetscCall(ISGetIndices(isrow, &rp));
1833 
1834   for (k = mbs - 1; k >= 0; k--) {
1835     v     = aa + ai[k] + 1;
1836     vj    = aj + ai[k] + 1;
1837     diagk = PetscRealPart(aa[ai[k]]);
1838     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1839     t[k] = b[k] * PetscSqrtReal(diagk);
1840     nz   = ai[k + 1] - ai[k] - 1;
1841     while (nz--) t[k] += (*v++) * t[*vj++];
1842     x[rp[k]] = t[k];
1843   }
1844   PetscCall(ISRestoreIndices(isrow, &rp));
1845   PetscCall(VecRestoreArrayRead(bb, &b));
1846   PetscCall(VecRestoreArray(xx, &x));
1847   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1848   PetscFunctionReturn(0);
1849 }
1850 
1851 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx) {
1852   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1853 
1854   PetscFunctionBegin;
1855   if (A->rmap->bs == 1) {
1856     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1857   } else {
1858     IS                 isrow = a->row;
1859     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1860     const MatScalar   *aa = a->a, *v;
1861     PetscScalar       *x, *t;
1862     const PetscScalar *b;
1863     PetscInt           nz, k, n, i, j;
1864 
1865     if (bb->n > a->solves_work_n) {
1866       PetscCall(PetscFree(a->solves_work));
1867       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1868       a->solves_work_n = bb->n;
1869     }
1870     n = bb->n;
1871     PetscCall(VecGetArrayRead(bb->v, &b));
1872     PetscCall(VecGetArray(xx->v, &x));
1873     t = a->solves_work;
1874 
1875     PetscCall(ISGetIndices(isrow, &rp));
1876 
1877     /* solve U^T*D*y = perm(b) by forward substitution */
1878     for (k = 0; k < mbs; k++) {
1879       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1880     }
1881     for (k = 0; k < mbs; k++) {
1882       v  = aa + ai[k];
1883       vj = aj + ai[k];
1884       nz = ai[k + 1] - ai[k] - 1;
1885       for (j = 0; j < nz; j++) {
1886         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1887         v++;
1888         vj++;
1889       }
1890       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1891     }
1892 
1893     /* solve U*perm(x) = y by back substitution */
1894     for (k = mbs - 1; k >= 0; k--) {
1895       v  = aa + ai[k] - 1;
1896       vj = aj + ai[k] - 1;
1897       nz = ai[k + 1] - ai[k] - 1;
1898       for (j = 0; j < nz; j++) {
1899         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1900         v++;
1901         vj++;
1902       }
1903       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1904     }
1905 
1906     PetscCall(ISRestoreIndices(isrow, &rp));
1907     PetscCall(VecRestoreArrayRead(bb->v, &b));
1908     PetscCall(VecRestoreArray(xx->v, &x));
1909     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1910   }
1911   PetscFunctionReturn(0);
1912 }
1913 
1914 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx) {
1915   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1916 
1917   PetscFunctionBegin;
1918   if (A->rmap->bs == 1) {
1919     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1920   } else {
1921     IS                 isrow = a->row;
1922     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1923     const MatScalar   *aa = a->a, *v;
1924     PetscScalar       *x, *t;
1925     const PetscScalar *b;
1926     PetscInt           nz, k, n, i;
1927 
1928     if (bb->n > a->solves_work_n) {
1929       PetscCall(PetscFree(a->solves_work));
1930       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1931       a->solves_work_n = bb->n;
1932     }
1933     n = bb->n;
1934     PetscCall(VecGetArrayRead(bb->v, &b));
1935     PetscCall(VecGetArray(xx->v, &x));
1936     t = a->solves_work;
1937 
1938     PetscCall(ISGetIndices(isrow, &rp));
1939 
1940     /* solve U^T*D*y = perm(b) by forward substitution */
1941     for (k = 0; k < mbs; k++) {
1942       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1943     }
1944     for (k = 0; k < mbs; k++) {
1945       v  = aa + ai[k];
1946       vj = aj + ai[k];
1947       nz = ai[k + 1] - ai[k];
1948       while (nz--) {
1949         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1950         v++;
1951         vj++;
1952       }
1953       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
1954     }
1955 
1956     /* solve U*perm(x) = y by back substitution */
1957     for (k = mbs - 1; k >= 0; k--) {
1958       v  = aa + ai[k];
1959       vj = aj + ai[k];
1960       nz = ai[k + 1] - ai[k];
1961       while (nz--) {
1962         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1963         v++;
1964         vj++;
1965       }
1966       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1967     }
1968 
1969     PetscCall(ISRestoreIndices(isrow, &rp));
1970     PetscCall(VecRestoreArrayRead(bb->v, &b));
1971     PetscCall(VecRestoreArray(xx->v, &x));
1972     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1973   }
1974   PetscFunctionReturn(0);
1975 }
1976 
1977 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) {
1978   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1979   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
1980   const MatScalar   *aa = a->a, *v;
1981   const PetscScalar *b;
1982   PetscScalar       *x, xi;
1983   PetscInt           nz, i, j;
1984 
1985   PetscFunctionBegin;
1986   PetscCall(VecGetArrayRead(bb, &b));
1987   PetscCall(VecGetArray(xx, &x));
1988   /* solve U^T*D*y = b by forward substitution */
1989   PetscCall(PetscArraycpy(x, b, mbs));
1990   for (i = 0; i < mbs; i++) {
1991     v  = aa + ai[i];
1992     vj = aj + ai[i];
1993     xi = x[i];
1994     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
1995     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
1996     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
1997   }
1998   /* solve U*x = y by backward substitution */
1999   for (i = mbs - 2; i >= 0; i--) {
2000     xi = x[i];
2001     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2002     vj = aj + adiag[i] - 1;
2003     nz = ai[i + 1] - ai[i] - 1;
2004     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2005     x[i] = xi;
2006   }
2007   PetscCall(VecRestoreArrayRead(bb, &b));
2008   PetscCall(VecRestoreArray(xx, &x));
2009   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2010   PetscFunctionReturn(0);
2011 }
2012 
2013 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X) {
2014   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2015   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2016   const MatScalar   *aa = a->a, *v;
2017   const PetscScalar *b;
2018   PetscScalar       *x, xi;
2019   PetscInt           nz, i, j, neq, ldb, ldx;
2020   PetscBool          isdense;
2021 
2022   PetscFunctionBegin;
2023   if (!mbs) PetscFunctionReturn(0);
2024   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2025   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2026   if (X != B) {
2027     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2028     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2029   }
2030   PetscCall(MatDenseGetArrayRead(B, &b));
2031   PetscCall(MatDenseGetLDA(B, &ldb));
2032   PetscCall(MatDenseGetArray(X, &x));
2033   PetscCall(MatDenseGetLDA(X, &ldx));
2034   for (neq = 0; neq < B->cmap->n; neq++) {
2035     /* solve U^T*D*y = b by forward substitution */
2036     PetscCall(PetscArraycpy(x, b, mbs));
2037     for (i = 0; i < mbs; i++) {
2038       v  = aa + ai[i];
2039       vj = aj + ai[i];
2040       xi = x[i];
2041       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2042       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2043       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2044     }
2045     /* solve U*x = y by backward substitution */
2046     for (i = mbs - 2; i >= 0; i--) {
2047       xi = x[i];
2048       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2049       vj = aj + adiag[i] - 1;
2050       nz = ai[i + 1] - ai[i] - 1;
2051       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2052       x[i] = xi;
2053     }
2054     b += ldb;
2055     x += ldx;
2056   }
2057   PetscCall(MatDenseRestoreArrayRead(B, &b));
2058   PetscCall(MatDenseRestoreArray(X, &x));
2059   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2060   PetscFunctionReturn(0);
2061 }
2062 
2063 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
2064   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2065   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2066   const MatScalar   *aa = a->a, *v;
2067   PetscScalar       *x, xk;
2068   const PetscScalar *b;
2069   PetscInt           nz, k;
2070 
2071   PetscFunctionBegin;
2072   PetscCall(VecGetArrayRead(bb, &b));
2073   PetscCall(VecGetArray(xx, &x));
2074 
2075   /* solve U^T*D*y = b by forward substitution */
2076   PetscCall(PetscArraycpy(x, b, mbs));
2077   for (k = 0; k < mbs; k++) {
2078     v  = aa + ai[k] + 1;
2079     vj = aj + ai[k] + 1;
2080     xk = x[k];
2081     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2082     while (nz--) x[*vj++] += (*v++) * xk;
2083     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2084   }
2085 
2086   /* solve U*x = y by back substitution */
2087   for (k = mbs - 2; k >= 0; k--) {
2088     v  = aa + ai[k] + 1;
2089     vj = aj + ai[k] + 1;
2090     xk = x[k];
2091     nz = ai[k + 1] - ai[k] - 1;
2092     while (nz--) { xk += (*v++) * x[*vj++]; }
2093     x[k] = xk;
2094   }
2095 
2096   PetscCall(VecRestoreArrayRead(bb, &b));
2097   PetscCall(VecRestoreArray(xx, &x));
2098   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) {
2103   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2104   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2105   const MatScalar   *aa = a->a, *v;
2106   PetscReal          diagk;
2107   PetscScalar       *x;
2108   const PetscScalar *b;
2109   PetscInt           nz, k;
2110 
2111   PetscFunctionBegin;
2112   /* solve U^T*D^(1/2)*x = b by forward substitution */
2113   PetscCall(VecGetArrayRead(bb, &b));
2114   PetscCall(VecGetArray(xx, &x));
2115   PetscCall(PetscArraycpy(x, b, mbs));
2116   for (k = 0; k < mbs; k++) {
2117     v  = aa + ai[k];
2118     vj = aj + ai[k];
2119     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2120     while (nz--) x[*vj++] += (*v++) * x[k];
2121     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2122     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2123     x[k] *= PetscSqrtReal(diagk);
2124   }
2125   PetscCall(VecRestoreArrayRead(bb, &b));
2126   PetscCall(VecRestoreArray(xx, &x));
2127   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2128   PetscFunctionReturn(0);
2129 }
2130 
2131 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
2132   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2133   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2134   const MatScalar   *aa = a->a, *v;
2135   PetscReal          diagk;
2136   PetscScalar       *x;
2137   const PetscScalar *b;
2138   PetscInt           nz, k;
2139 
2140   PetscFunctionBegin;
2141   /* solve U^T*D^(1/2)*x = b by forward substitution */
2142   PetscCall(VecGetArrayRead(bb, &b));
2143   PetscCall(VecGetArray(xx, &x));
2144   PetscCall(PetscArraycpy(x, b, mbs));
2145   for (k = 0; k < mbs; k++) {
2146     v  = aa + ai[k] + 1;
2147     vj = aj + ai[k] + 1;
2148     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2149     while (nz--) x[*vj++] += (*v++) * x[k];
2150     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2151     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2152     x[k] *= PetscSqrtReal(diagk);
2153   }
2154   PetscCall(VecRestoreArrayRead(bb, &b));
2155   PetscCall(VecRestoreArray(xx, &x));
2156   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2157   PetscFunctionReturn(0);
2158 }
2159 
2160 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx) {
2161   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2162   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2163   const MatScalar   *aa = a->a, *v;
2164   PetscReal          diagk;
2165   PetscScalar       *x;
2166   const PetscScalar *b;
2167   PetscInt           nz, k;
2168 
2169   PetscFunctionBegin;
2170   /* solve D^(1/2)*U*x = b by back substitution */
2171   PetscCall(VecGetArrayRead(bb, &b));
2172   PetscCall(VecGetArray(xx, &x));
2173 
2174   for (k = mbs - 1; k >= 0; k--) {
2175     v     = aa + ai[k];
2176     vj    = aj + ai[k];
2177     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2178     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2179     x[k] = PetscSqrtReal(diagk) * b[k];
2180     nz   = ai[k + 1] - ai[k] - 1;
2181     while (nz--) x[k] += (*v++) * x[*vj++];
2182   }
2183   PetscCall(VecRestoreArrayRead(bb, &b));
2184   PetscCall(VecRestoreArray(xx, &x));
2185   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2186   PetscFunctionReturn(0);
2187 }
2188 
2189 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) {
2190   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2191   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2192   const MatScalar   *aa = a->a, *v;
2193   PetscReal          diagk;
2194   PetscScalar       *x;
2195   const PetscScalar *b;
2196   PetscInt           nz, k;
2197 
2198   PetscFunctionBegin;
2199   /* solve D^(1/2)*U*x = b by back substitution */
2200   PetscCall(VecGetArrayRead(bb, &b));
2201   PetscCall(VecGetArray(xx, &x));
2202 
2203   for (k = mbs - 1; k >= 0; k--) {
2204     v     = aa + ai[k] + 1;
2205     vj    = aj + ai[k] + 1;
2206     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2207     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2208     x[k] = PetscSqrtReal(diagk) * b[k];
2209     nz   = ai[k + 1] - ai[k] - 1;
2210     while (nz--) x[k] += (*v++) * x[*vj++];
2211   }
2212   PetscCall(VecRestoreArrayRead(bb, &b));
2213   PetscCall(VecRestoreArray(xx, &x));
2214   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2215   PetscFunctionReturn(0);
2216 }
2217 
2218 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2219 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info) {
2220   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2221   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2222   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2223   PetscInt        m, reallocs = 0, *levtmp;
2224   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2225   PetscInt        incrlev, *lev, shift, prow, nz;
2226   PetscReal       f = info->fill, levels = info->levels;
2227   PetscBool       perm_identity;
2228 
2229   PetscFunctionBegin;
2230   /* check whether perm is the identity mapping */
2231   PetscCall(ISIdentity(perm, &perm_identity));
2232 
2233   if (perm_identity) {
2234     a->permute = PETSC_FALSE;
2235     ai         = a->i;
2236     aj         = a->j;
2237   } else { /*  non-trivial permutation */
2238     SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2239   }
2240 
2241   /* initialization */
2242   PetscCall(ISGetIndices(perm, &rip));
2243   umax = (PetscInt)(f * ai[mbs] + 1);
2244   PetscCall(PetscMalloc1(umax, &lev));
2245   umax += mbs + 1;
2246   shift = mbs + 1;
2247   PetscCall(PetscMalloc1(mbs + 1, &iu));
2248   PetscCall(PetscMalloc1(umax, &ju));
2249   iu[0] = mbs + 1;
2250   juidx = mbs + 1;
2251   /* prowl: linked list for pivot row */
2252   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2253   /* q: linked list for col index */
2254 
2255   for (i = 0; i < mbs; i++) {
2256     prowl[i] = mbs;
2257     q[i]     = 0;
2258   }
2259 
2260   /* for each row k */
2261   for (k = 0; k < mbs; k++) {
2262     nzk  = 0;
2263     q[k] = mbs;
2264     /* copy current row into linked list */
2265     nz   = ai[rip[k] + 1] - ai[rip[k]];
2266     j    = ai[rip[k]];
2267     while (nz--) {
2268       vj = rip[aj[j++]];
2269       if (vj > k) {
2270         qm = k;
2271         do {
2272           m  = qm;
2273           qm = q[m];
2274         } while (qm < vj);
2275         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2276         nzk++;
2277         q[m]       = vj;
2278         q[vj]      = qm;
2279         levtmp[vj] = 0; /* initialize lev for nonzero element */
2280       }
2281     }
2282 
2283     /* modify nonzero structure of k-th row by computing fill-in
2284        for each row prow to be merged in */
2285     prow = k;
2286     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2287 
2288     while (prow < k) {
2289       /* merge row prow into k-th row */
2290       jmin = iu[prow] + 1;
2291       jmax = iu[prow + 1];
2292       qm   = k;
2293       for (j = jmin; j < jmax; j++) {
2294         incrlev = lev[j - shift] + 1;
2295         if (incrlev > levels) continue;
2296         vj = ju[j];
2297         do {
2298           m  = qm;
2299           qm = q[m];
2300         } while (qm < vj);
2301         if (qm != vj) { /* a fill */
2302           nzk++;
2303           q[m]       = vj;
2304           q[vj]      = qm;
2305           qm         = vj;
2306           levtmp[vj] = incrlev;
2307         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2308       }
2309       prow = prowl[prow]; /* next pivot row */
2310     }
2311 
2312     /* add k to row list for first nonzero element in k-th row */
2313     if (nzk > 1) {
2314       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2315       prowl[k] = prowl[i];
2316       prowl[i] = k;
2317     }
2318     iu[k + 1] = iu[k] + nzk;
2319 
2320     /* allocate more space to ju and lev if needed */
2321     if (iu[k + 1] > umax) {
2322       /* estimate how much additional space we will need */
2323       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2324       /* just double the memory each time */
2325       maxadd = umax;
2326       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2327       umax += maxadd;
2328 
2329       /* allocate a longer ju */
2330       PetscCall(PetscMalloc1(umax, &jutmp));
2331       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2332       PetscCall(PetscFree(ju));
2333       ju = jutmp;
2334 
2335       PetscCall(PetscMalloc1(umax, &jutmp));
2336       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2337       PetscCall(PetscFree(lev));
2338       lev = jutmp;
2339       reallocs += 2; /* count how many times we realloc */
2340     }
2341 
2342     /* save nonzero structure of k-th row in ju */
2343     i = k;
2344     while (nzk--) {
2345       i                  = q[i];
2346       ju[juidx]          = i;
2347       lev[juidx - shift] = levtmp[i];
2348       juidx++;
2349     }
2350   }
2351 
2352 #if defined(PETSC_USE_INFO)
2353   if (ai[mbs] != 0) {
2354     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2355     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2356     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2357     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2358     PetscCall(PetscInfo(A, "for best performance.\n"));
2359   } else {
2360     PetscCall(PetscInfo(A, "Empty matrix\n"));
2361   }
2362 #endif
2363 
2364   PetscCall(ISRestoreIndices(perm, &rip));
2365   PetscCall(PetscFree3(prowl, q, levtmp));
2366   PetscCall(PetscFree(lev));
2367 
2368   /* put together the new matrix */
2369   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2370 
2371   /* PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)iperm)); */
2372   b = (Mat_SeqSBAIJ *)(B)->data;
2373   PetscCall(PetscFree2(b->imax, b->ilen));
2374 
2375   b->singlemalloc = PETSC_FALSE;
2376   b->free_a       = PETSC_TRUE;
2377   b->free_ij      = PETSC_TRUE;
2378   /* the next line frees the default space generated by the Create() */
2379   PetscCall(PetscFree3(b->a, b->j, b->i));
2380   PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2381   b->j    = ju;
2382   b->i    = iu;
2383   b->diag = NULL;
2384   b->ilen = NULL;
2385   b->imax = NULL;
2386 
2387   PetscCall(ISDestroy(&b->row));
2388   PetscCall(ISDestroy(&b->icol));
2389   b->row  = perm;
2390   b->icol = perm;
2391   PetscCall(PetscObjectReference((PetscObject)perm));
2392   PetscCall(PetscObjectReference((PetscObject)perm));
2393   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2394   /* In b structure:  Free imax, ilen, old a, old j.
2395      Allocate idnew, solve_work, new a, new j */
2396   PetscCall(PetscLogObjectMemory((PetscObject)B, (iu[mbs] - mbs) * (sizeof(PetscInt) + sizeof(MatScalar))));
2397   b->maxnz = b->nz = iu[mbs];
2398 
2399   (B)->info.factor_mallocs   = reallocs;
2400   (B)->info.fill_ratio_given = f;
2401   if (ai[mbs] != 0) {
2402     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2403   } else {
2404     (B)->info.fill_ratio_needed = 0.0;
2405   }
2406   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2407   PetscFunctionReturn(0);
2408 }
2409 
2410 /*
2411   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2412 */
2413 #include <petscbt.h>
2414 #include <../src/mat/utils/freespace.h>
2415 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
2416   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2417   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2418   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2419   const PetscInt    *rip;
2420   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2421   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2422   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2423   PetscReal          fill = info->fill, levels = info->levels;
2424   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2425   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2426   PetscBT            lnkbt;
2427 
2428   PetscFunctionBegin;
2429   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2430   PetscCall(MatMissingDiagonal(A, &missing, &d));
2431   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2432   if (bs > 1) {
2433     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2434     PetscFunctionReturn(0);
2435   }
2436 
2437   /* check whether perm is the identity mapping */
2438   PetscCall(ISIdentity(perm, &perm_identity));
2439   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2440   a->permute = PETSC_FALSE;
2441 
2442   PetscCall(PetscMalloc1(am + 1, &ui));
2443   PetscCall(PetscMalloc1(am + 1, &udiag));
2444   ui[0] = 0;
2445 
2446   /* ICC(0) without matrix ordering: simply rearrange column indices */
2447   if (!levels) {
2448     /* reuse the column pointers and row offsets for memory savings */
2449     for (i = 0; i < am; i++) {
2450       ncols     = ai[i + 1] - ai[i];
2451       ui[i + 1] = ui[i] + ncols;
2452       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2453     }
2454     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2455     cols = uj;
2456     for (i = 0; i < am; i++) {
2457       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2458       ncols = ai[i + 1] - ai[i] - 1;
2459       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2460       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2461     }
2462   } else { /* case: levels>0 */
2463     PetscCall(ISGetIndices(perm, &rip));
2464 
2465     /* initialization */
2466     /* jl: linked list for storing indices of the pivot rows
2467        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2468     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2469     for (i = 0; i < am; i++) {
2470       jl[i] = am;
2471       il[i] = 0;
2472     }
2473 
2474     /* create and initialize a linked list for storing column indices of the active row k */
2475     nlnk = am + 1;
2476     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2477 
2478     /* initial FreeSpace size is fill*(ai[am]+1) */
2479     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2480 
2481     current_space = free_space;
2482 
2483     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2484 
2485     current_space_lvl = free_space_lvl;
2486 
2487     for (k = 0; k < am; k++) { /* for each active row k */
2488       /* initialize lnk by the column indices of row k */
2489       nzk   = 0;
2490       ncols = ai[k + 1] - ai[k];
2491       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2492       cols = aj + ai[k];
2493       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2494       nzk += nlnk;
2495 
2496       /* update lnk by computing fill-in for each pivot row to be merged in */
2497       prow = jl[k]; /* 1st pivot row */
2498 
2499       while (prow < k) {
2500         nextprow = jl[prow];
2501 
2502         /* merge prow into k-th row */
2503         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2504         jmax  = ui[prow + 1];
2505         ncols = jmax - jmin;
2506         i     = jmin - ui[prow];
2507 
2508         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2509         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2510         j    = *(uj - 1);
2511         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2512         nzk += nlnk;
2513 
2514         /* update il and jl for prow */
2515         if (jmin < jmax) {
2516           il[prow] = jmin;
2517 
2518           j        = *cols;
2519           jl[prow] = jl[j];
2520           jl[j]    = prow;
2521         }
2522         prow = nextprow;
2523       }
2524 
2525       /* if free space is not available, make more free space */
2526       if (current_space->local_remaining < nzk) {
2527         i = am - k + 1;                                    /* num of unfactored rows */
2528         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2529         PetscCall(PetscFreeSpaceGet(i, &current_space));
2530         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2531         reallocs++;
2532       }
2533 
2534       /* copy data into free_space and free_space_lvl, then initialize lnk */
2535       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2536       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2537 
2538       /* add the k-th row into il and jl */
2539       if (nzk > 1) {
2540         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2541         jl[k] = jl[i];
2542         jl[i] = k;
2543         il[k] = ui[k] + 1;
2544       }
2545       uj_ptr[k]     = current_space->array;
2546       uj_lvl_ptr[k] = current_space_lvl->array;
2547 
2548       current_space->array += nzk;
2549       current_space->local_used += nzk;
2550       current_space->local_remaining -= nzk;
2551       current_space_lvl->array += nzk;
2552       current_space_lvl->local_used += nzk;
2553       current_space_lvl->local_remaining -= nzk;
2554 
2555       ui[k + 1] = ui[k] + nzk;
2556     }
2557 
2558     PetscCall(ISRestoreIndices(perm, &rip));
2559     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2560 
2561     /* destroy list of free space and other temporary array(s) */
2562     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2563     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2564     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2565     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2566 
2567   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2568 
2569   /* put together the new matrix in MATSEQSBAIJ format */
2570   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2571 
2572   b = (Mat_SeqSBAIJ *)(fact)->data;
2573   PetscCall(PetscFree2(b->imax, b->ilen));
2574 
2575   b->singlemalloc = PETSC_FALSE;
2576   b->free_a       = PETSC_TRUE;
2577   b->free_ij      = free_ij;
2578 
2579   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2580 
2581   b->j         = uj;
2582   b->i         = ui;
2583   b->diag      = udiag;
2584   b->free_diag = PETSC_TRUE;
2585   b->ilen      = NULL;
2586   b->imax      = NULL;
2587   b->row       = perm;
2588   b->col       = perm;
2589 
2590   PetscCall(PetscObjectReference((PetscObject)perm));
2591   PetscCall(PetscObjectReference((PetscObject)perm));
2592 
2593   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2594 
2595   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2596   PetscCall(PetscLogObjectMemory((PetscObject)fact, ui[am] * (sizeof(PetscInt) + sizeof(MatScalar))));
2597 
2598   b->maxnz = b->nz = ui[am];
2599 
2600   fact->info.factor_mallocs   = reallocs;
2601   fact->info.fill_ratio_given = fill;
2602   if (ai[am] != 0) {
2603     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2604   } else {
2605     fact->info.fill_ratio_needed = 0.0;
2606   }
2607 #if defined(PETSC_USE_INFO)
2608   if (ai[am] != 0) {
2609     PetscReal af = fact->info.fill_ratio_needed;
2610     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2611     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2612     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2613   } else {
2614     PetscCall(PetscInfo(A, "Empty matrix\n"));
2615   }
2616 #endif
2617   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2618   PetscFunctionReturn(0);
2619 }
2620 
2621 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info) {
2622   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2623   Mat_SeqSBAIJ      *b;
2624   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2625   PetscInt           bs = A->rmap->bs, am = a->mbs;
2626   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2627   PetscInt           reallocs = 0, i, *ui;
2628   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2629   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2630   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2631   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2632   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2633   PetscBT            lnkbt;
2634 
2635   PetscFunctionBegin;
2636   /*
2637    This code originally uses Modified Sparse Row (MSR) storage
2638    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2639    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2640    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2641    thus the original code in MSR format is still used for these cases.
2642    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2643    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2644   */
2645   if (bs > 1) {
2646     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2647     PetscFunctionReturn(0);
2648   }
2649 
2650   /* check whether perm is the identity mapping */
2651   PetscCall(ISIdentity(perm, &perm_identity));
2652   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2653   a->permute = PETSC_FALSE;
2654 
2655   /* special case that simply copies fill pattern */
2656   if (!levels) {
2657     /* reuse the column pointers and row offsets for memory savings */
2658     ui           = a->i;
2659     uj           = a->j;
2660     free_ij      = PETSC_FALSE;
2661     ratio_needed = 1.0;
2662   } else { /* case: levels>0 */
2663     PetscCall(ISGetIndices(perm, &rip));
2664 
2665     /* initialization */
2666     PetscCall(PetscMalloc1(am + 1, &ui));
2667     ui[0] = 0;
2668 
2669     /* jl: linked list for storing indices of the pivot rows
2670        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2671     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2672     for (i = 0; i < am; i++) {
2673       jl[i] = am;
2674       il[i] = 0;
2675     }
2676 
2677     /* create and initialize a linked list for storing column indices of the active row k */
2678     nlnk = am + 1;
2679     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2680 
2681     /* initial FreeSpace size is fill*(ai[am]+1) */
2682     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2683 
2684     current_space = free_space;
2685 
2686     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2687 
2688     current_space_lvl = free_space_lvl;
2689 
2690     for (k = 0; k < am; k++) { /* for each active row k */
2691       /* initialize lnk by the column indices of row rip[k] */
2692       nzk   = 0;
2693       ncols = ai[rip[k] + 1] - ai[rip[k]];
2694       cols  = aj + ai[rip[k]];
2695       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2696       nzk += nlnk;
2697 
2698       /* update lnk by computing fill-in for each pivot row to be merged in */
2699       prow = jl[k]; /* 1st pivot row */
2700 
2701       while (prow < k) {
2702         nextprow = jl[prow];
2703 
2704         /* merge prow into k-th row */
2705         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2706         jmax     = ui[prow + 1];
2707         ncols    = jmax - jmin;
2708         i        = jmin - ui[prow];
2709         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2710         j        = *(uj_lvl_ptr[prow] + i - 1);
2711         cols_lvl = uj_lvl_ptr[prow] + i;
2712         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2713         nzk += nlnk;
2714 
2715         /* update il and jl for prow */
2716         if (jmin < jmax) {
2717           il[prow] = jmin;
2718           j        = *cols;
2719           jl[prow] = jl[j];
2720           jl[j]    = prow;
2721         }
2722         prow = nextprow;
2723       }
2724 
2725       /* if free space is not available, make more free space */
2726       if (current_space->local_remaining < nzk) {
2727         i = am - k + 1;                                                             /* num of unfactored rows */
2728         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2729         PetscCall(PetscFreeSpaceGet(i, &current_space));
2730         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2731         reallocs++;
2732       }
2733 
2734       /* copy data into free_space and free_space_lvl, then initialize lnk */
2735       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2736 
2737       /* add the k-th row into il and jl */
2738       if (nzk - 1 > 0) {
2739         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2740         jl[k] = jl[i];
2741         jl[i] = k;
2742         il[k] = ui[k] + 1;
2743       }
2744       uj_ptr[k]     = current_space->array;
2745       uj_lvl_ptr[k] = current_space_lvl->array;
2746 
2747       current_space->array += nzk;
2748       current_space->local_used += nzk;
2749       current_space->local_remaining -= nzk;
2750       current_space_lvl->array += nzk;
2751       current_space_lvl->local_used += nzk;
2752       current_space_lvl->local_remaining -= nzk;
2753 
2754       ui[k + 1] = ui[k] + nzk;
2755     }
2756 
2757     PetscCall(ISRestoreIndices(perm, &rip));
2758     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2759 
2760     /* destroy list of free space and other temporary array(s) */
2761     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2762     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2763     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2764     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2765     if (ai[am] != 0) {
2766       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2767     } else {
2768       ratio_needed = 0.0;
2769     }
2770   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2771 
2772   /* put together the new matrix in MATSEQSBAIJ format */
2773   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2774 
2775   b = (Mat_SeqSBAIJ *)(fact)->data;
2776 
2777   PetscCall(PetscFree2(b->imax, b->ilen));
2778 
2779   b->singlemalloc = PETSC_FALSE;
2780   b->free_a       = PETSC_TRUE;
2781   b->free_ij      = free_ij;
2782 
2783   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2784 
2785   b->j             = uj;
2786   b->i             = ui;
2787   b->diag          = NULL;
2788   b->ilen          = NULL;
2789   b->imax          = NULL;
2790   b->row           = perm;
2791   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2792 
2793   PetscCall(PetscObjectReference((PetscObject)perm));
2794   b->icol = perm;
2795   PetscCall(PetscObjectReference((PetscObject)perm));
2796   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2797 
2798   b->maxnz = b->nz = ui[am];
2799 
2800   fact->info.factor_mallocs    = reallocs;
2801   fact->info.fill_ratio_given  = fill;
2802   fact->info.fill_ratio_needed = ratio_needed;
2803 #if defined(PETSC_USE_INFO)
2804   if (ai[am] != 0) {
2805     PetscReal af = fact->info.fill_ratio_needed;
2806     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2807     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2808     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2809   } else {
2810     PetscCall(PetscInfo(A, "Empty matrix\n"));
2811   }
2812 #endif
2813   if (perm_identity) {
2814     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2815   } else {
2816     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2817   }
2818   PetscFunctionReturn(0);
2819 }
2820