xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 8d47944aebcd8a3a8a4a1aebd4b3f440b4d70fab)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
11 {
12   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
13   IS                 isrow = a->row;
14   PetscInt           mbs = a->mbs, *ai = a->i, *aj = a->j;
15   const PetscInt    *r;
16   PetscInt           nz, *vj, k, idx, k1;
17   PetscInt           bs = A->rmap->bs, bs2 = a->bs2;
18   const MatScalar   *aa = a->a, *v, *diag;
19   PetscScalar       *x, *xk, *xj, *xk_tmp, *t;
20   const PetscScalar *b;
21 
22   PetscFunctionBegin;
23   PetscCall(VecGetArrayRead(bb, &b));
24   PetscCall(VecGetArray(xx, &x));
25   t = a->solve_work;
26   PetscCall(ISGetIndices(isrow, &r));
27   PetscCall(PetscMalloc1(bs, &xk_tmp));
28 
29   /* solve U^T * D * y = b by forward substitution */
30   xk = t;
31   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
32     idx = bs * r[k];
33     for (k1 = 0; k1 < bs; k1++) *xk++ = b[idx + k1];
34   }
35   for (k = 0; k < mbs; k++) {
36     v  = aa + bs2 * ai[k];
37     xk = t + k * bs;                          /* Dk*xk = k-th block of x */
38     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
39     nz = ai[k + 1] - ai[k];
40     vj = aj + ai[k];
41     xj = t + (*vj) * bs; /* *vj-th block of x, *vj>k */
42     while (nz--) {
43       /* x(:) += U(k,:)^T*(Dk*xk) */
44       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
45       vj++;
46       xj = t + (*vj) * bs;
47       v += bs2;
48     }
49     /* xk = inv(Dk)*(Dk*xk) */
50     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
51     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
52   }
53 
54   /* solve U*x = y by back substitution */
55   for (k = mbs - 1; k >= 0; k--) {
56     v  = aa + bs2 * ai[k];
57     xk = t + k * bs; /* xk */
58     nz = ai[k + 1] - ai[k];
59     vj = aj + ai[k];
60     xj = t + (*vj) * bs;
61     while (nz--) {
62       /* xk += U(k,:)*x(:) */
63       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
64       vj++;
65       v += bs2;
66       xj = t + (*vj) * bs;
67     }
68     idx = bs * r[k];
69     for (k1 = 0; k1 < bs; k1++) x[idx + k1] = *xk++;
70   }
71 
72   PetscCall(PetscFree(xk_tmp));
73   PetscCall(ISRestoreIndices(isrow, &r));
74   PetscCall(VecRestoreArrayRead(bb, &b));
75   PetscCall(VecRestoreArray(xx, &x));
76   PetscCall(PetscLogFlops(4.0 * bs2 * a->nz - (bs + 2.0 * bs2) * mbs));
77   PetscFunctionReturn(PETSC_SUCCESS);
78 }
79 
80 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
81 {
82   PetscFunctionBegin;
83   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "not implemented yet");
84 }
85 
86 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
87 {
88   PetscFunctionBegin;
89   SETERRQ(PETSC_COMM_SELF, PETSC_ERR_SUP, "Not yet implemented");
90 }
91 
92 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
93 {
94   PetscInt         nz, k;
95   const PetscInt  *vj, bs2 = bs * bs;
96   const MatScalar *v, *diag;
97   PetscScalar     *xk, *xj, *xk_tmp;
98 
99   PetscFunctionBegin;
100   PetscCall(PetscMalloc1(bs, &xk_tmp));
101   for (k = 0; k < mbs; k++) {
102     v  = aa + bs2 * ai[k];
103     xk = x + k * bs;                          /* Dk*xk = k-th block of x */
104     PetscCall(PetscArraycpy(xk_tmp, xk, bs)); /* xk_tmp <- xk */
105     nz = ai[k + 1] - ai[k];
106     vj = aj + ai[k];
107     xj = x + (size_t)(*vj) * bs; /* *vj-th block of x, *vj>k */
108     while (nz--) {
109       /* x(:) += U(k,:)^T*(Dk*xk) */
110       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs, xj, v, xk_tmp); /* xj <- xj + v^t * xk */
111       vj++;
112       xj = x + (size_t)(*vj) * bs;
113       v += bs2;
114     }
115     /* xk = inv(Dk)*(Dk*xk) */
116     diag = aa + k * bs2;                                /* ptr to inv(Dk) */
117     PetscKernel_w_gets_A_times_v(bs, xk_tmp, diag, xk); /* xk <- diag * xk */
118   }
119   PetscCall(PetscFree(xk_tmp));
120   PetscFunctionReturn(PETSC_SUCCESS);
121 }
122 
123 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscInt bs, PetscScalar *x)
124 {
125   PetscInt         nz, k;
126   const PetscInt  *vj, bs2 = bs * bs;
127   const MatScalar *v;
128   PetscScalar     *xk, *xj;
129 
130   PetscFunctionBegin;
131   for (k = mbs - 1; k >= 0; k--) {
132     v  = aa + bs2 * ai[k];
133     xk = x + k * bs; /* xk */
134     nz = ai[k + 1] - ai[k];
135     vj = aj + ai[k];
136     xj = x + (size_t)(*vj) * bs;
137     while (nz--) {
138       /* xk += U(k,:)*x(:) */
139       PetscKernel_v_gets_v_plus_A_times_w(bs, xk, v, xj); /* xk <- xk + v*xj */
140       vj++;
141       v += bs2;
142       xj = x + (size_t)(*vj) * bs;
143     }
144   }
145   PetscFunctionReturn(PETSC_SUCCESS);
146 }
147 
148 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
149 {
150   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
151   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
152   PetscInt           bs = A->rmap->bs;
153   const MatScalar   *aa = a->a;
154   PetscScalar       *x;
155   const PetscScalar *b;
156 
157   PetscFunctionBegin;
158   PetscCall(VecGetArrayRead(bb, &b));
159   PetscCall(VecGetArray(xx, &x));
160 
161   /* solve U^T * D * y = b by forward substitution */
162   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
163   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
164 
165   /* solve U*x = y by back substitution */
166   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
167 
168   PetscCall(VecRestoreArrayRead(bb, &b));
169   PetscCall(VecRestoreArray(xx, &x));
170   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (bs + 2.0 * a->bs2) * mbs));
171   PetscFunctionReturn(PETSC_SUCCESS);
172 }
173 
174 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
175 {
176   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
177   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
178   PetscInt           bs = A->rmap->bs;
179   const MatScalar   *aa = a->a;
180   const PetscScalar *b;
181   PetscScalar       *x;
182 
183   PetscFunctionBegin;
184   PetscCall(VecGetArrayRead(bb, &b));
185   PetscCall(VecGetArray(xx, &x));
186   PetscCall(PetscArraycpy(x, b, bs * mbs)); /* x <- b */
187   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
188   PetscCall(VecRestoreArrayRead(bb, &b));
189   PetscCall(VecRestoreArray(xx, &x));
190   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - bs * mbs));
191   PetscFunctionReturn(PETSC_SUCCESS);
192 }
193 
194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
195 {
196   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
197   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
198   PetscInt           bs = A->rmap->bs;
199   const MatScalar   *aa = a->a;
200   const PetscScalar *b;
201   PetscScalar       *x;
202 
203   PetscFunctionBegin;
204   PetscCall(VecGetArrayRead(bb, &b));
205   PetscCall(VecGetArray(xx, &x));
206   PetscCall(PetscArraycpy(x, b, bs * mbs));
207   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai, aj, aa, mbs, bs, x));
208   PetscCall(VecRestoreArrayRead(bb, &b));
209   PetscCall(VecRestoreArray(xx, &x));
210   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
211   PetscFunctionReturn(PETSC_SUCCESS);
212 }
213 
214 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
215 {
216   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
217   IS                 isrow = a->row;
218   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
219   PetscInt           nz, k, idx;
220   const MatScalar   *aa = a->a, *v, *d;
221   PetscScalar       *x, x0, x1, x2, x3, x4, x5, x6, *t, *tp;
222   const PetscScalar *b;
223 
224   PetscFunctionBegin;
225   PetscCall(VecGetArrayRead(bb, &b));
226   PetscCall(VecGetArray(xx, &x));
227   t = a->solve_work;
228   PetscCall(ISGetIndices(isrow, &r));
229 
230   /* solve U^T * D * y = b by forward substitution */
231   tp = t;
232   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
233     idx   = 7 * r[k];
234     tp[0] = b[idx];
235     tp[1] = b[idx + 1];
236     tp[2] = b[idx + 2];
237     tp[3] = b[idx + 3];
238     tp[4] = b[idx + 4];
239     tp[5] = b[idx + 5];
240     tp[6] = b[idx + 6];
241     tp += 7;
242   }
243 
244   for (k = 0; k < mbs; k++) {
245     v  = aa + 49 * ai[k];
246     vj = aj + ai[k];
247     tp = t + k * 7;
248     x0 = tp[0];
249     x1 = tp[1];
250     x2 = tp[2];
251     x3 = tp[3];
252     x4 = tp[4];
253     x5 = tp[5];
254     x6 = tp[6];
255     nz = ai[k + 1] - ai[k];
256     tp = t + (*vj) * 7;
257     while (nz--) {
258       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
259       tp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
260       tp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
261       tp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
262       tp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
263       tp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
264       tp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
265       vj++;
266       tp = t + (*vj) * 7;
267       v += 49;
268     }
269 
270     /* xk = inv(Dk)*(Dk*xk) */
271     d     = aa + k * 49; /* ptr to inv(Dk) */
272     tp    = t + k * 7;
273     tp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
274     tp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
275     tp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
276     tp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
277     tp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
278     tp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
279     tp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
280   }
281 
282   /* solve U*x = y by back substitution */
283   for (k = mbs - 1; k >= 0; k--) {
284     v  = aa + 49 * ai[k];
285     vj = aj + ai[k];
286     tp = t + k * 7;
287     x0 = tp[0];
288     x1 = tp[1];
289     x2 = tp[2];
290     x3 = tp[3];
291     x4 = tp[4];
292     x5 = tp[5];
293     x6 = tp[6]; /* xk */
294     nz = ai[k + 1] - ai[k];
295 
296     tp = t + (*vj) * 7;
297     while (nz--) {
298       /* xk += U(k,:)*x(:) */
299       x0 += v[0] * tp[0] + v[7] * tp[1] + v[14] * tp[2] + v[21] * tp[3] + v[28] * tp[4] + v[35] * tp[5] + v[42] * tp[6];
300       x1 += v[1] * tp[0] + v[8] * tp[1] + v[15] * tp[2] + v[22] * tp[3] + v[29] * tp[4] + v[36] * tp[5] + v[43] * tp[6];
301       x2 += v[2] * tp[0] + v[9] * tp[1] + v[16] * tp[2] + v[23] * tp[3] + v[30] * tp[4] + v[37] * tp[5] + v[44] * tp[6];
302       x3 += v[3] * tp[0] + v[10] * tp[1] + v[17] * tp[2] + v[24] * tp[3] + v[31] * tp[4] + v[38] * tp[5] + v[45] * tp[6];
303       x4 += v[4] * tp[0] + v[11] * tp[1] + v[18] * tp[2] + v[25] * tp[3] + v[32] * tp[4] + v[39] * tp[5] + v[46] * tp[6];
304       x5 += v[5] * tp[0] + v[12] * tp[1] + v[19] * tp[2] + v[26] * tp[3] + v[33] * tp[4] + v[40] * tp[5] + v[47] * tp[6];
305       x6 += v[6] * tp[0] + v[13] * tp[1] + v[20] * tp[2] + v[27] * tp[3] + v[34] * tp[4] + v[41] * tp[5] + v[48] * tp[6];
306       vj++;
307       tp = t + (*vj) * 7;
308       v += 49;
309     }
310     tp         = t + k * 7;
311     tp[0]      = x0;
312     tp[1]      = x1;
313     tp[2]      = x2;
314     tp[3]      = x3;
315     tp[4]      = x4;
316     tp[5]      = x5;
317     tp[6]      = x6;
318     idx        = 7 * r[k];
319     x[idx]     = x0;
320     x[idx + 1] = x1;
321     x[idx + 2] = x2;
322     x[idx + 3] = x3;
323     x[idx + 4] = x4;
324     x[idx + 5] = x5;
325     x[idx + 6] = x6;
326   }
327 
328   PetscCall(ISRestoreIndices(isrow, &r));
329   PetscCall(VecRestoreArrayRead(bb, &b));
330   PetscCall(VecRestoreArray(xx, &x));
331   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
332   PetscFunctionReturn(PETSC_SUCCESS);
333 }
334 
335 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
336 {
337   const MatScalar *v, *d;
338   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
339   PetscInt         nz, k;
340   const PetscInt  *vj;
341 
342   PetscFunctionBegin;
343   for (k = 0; k < mbs; k++) {
344     v  = aa + 49 * ai[k];
345     xp = x + k * 7;
346     x0 = xp[0];
347     x1 = xp[1];
348     x2 = xp[2];
349     x3 = xp[3];
350     x4 = xp[4];
351     x5 = xp[5];
352     x6 = xp[6]; /* Dk*xk = k-th block of x */
353     nz = ai[k + 1] - ai[k];
354     vj = aj + ai[k];
355     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
356     PetscPrefetchBlock(v + 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
357     while (nz--) {
358       xp = x + (*vj) * 7;
359       /* x(:) += U(k,:)^T*(Dk*xk) */
360       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5 + v[6] * x6;
361       xp[1] += v[7] * x0 + v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4 + v[12] * x5 + v[13] * x6;
362       xp[2] += v[14] * x0 + v[15] * x1 + v[16] * x2 + v[17] * x3 + v[18] * x4 + v[19] * x5 + v[20] * x6;
363       xp[3] += v[21] * x0 + v[22] * x1 + v[23] * x2 + v[24] * x3 + v[25] * x4 + v[26] * x5 + v[27] * x6;
364       xp[4] += v[28] * x0 + v[29] * x1 + v[30] * x2 + v[31] * x3 + v[32] * x4 + v[33] * x5 + v[34] * x6;
365       xp[5] += v[35] * x0 + v[36] * x1 + v[37] * x2 + v[38] * x3 + v[39] * x4 + v[40] * x5 + v[41] * x6;
366       xp[6] += v[42] * x0 + v[43] * x1 + v[44] * x2 + v[45] * x3 + v[46] * x4 + v[47] * x5 + v[48] * x6;
367       vj++;
368       v += 49;
369     }
370     /* xk = inv(Dk)*(Dk*xk) */
371     d     = aa + k * 49; /* ptr to inv(Dk) */
372     xp    = x + k * 7;
373     xp[0] = d[0] * x0 + d[7] * x1 + d[14] * x2 + d[21] * x3 + d[28] * x4 + d[35] * x5 + d[42] * x6;
374     xp[1] = d[1] * x0 + d[8] * x1 + d[15] * x2 + d[22] * x3 + d[29] * x4 + d[36] * x5 + d[43] * x6;
375     xp[2] = d[2] * x0 + d[9] * x1 + d[16] * x2 + d[23] * x3 + d[30] * x4 + d[37] * x5 + d[44] * x6;
376     xp[3] = d[3] * x0 + d[10] * x1 + d[17] * x2 + d[24] * x3 + d[31] * x4 + d[38] * x5 + d[45] * x6;
377     xp[4] = d[4] * x0 + d[11] * x1 + d[18] * x2 + d[25] * x3 + d[32] * x4 + d[39] * x5 + d[46] * x6;
378     xp[5] = d[5] * x0 + d[12] * x1 + d[19] * x2 + d[26] * x3 + d[33] * x4 + d[40] * x5 + d[47] * x6;
379     xp[6] = d[6] * x0 + d[13] * x1 + d[20] * x2 + d[27] * x3 + d[34] * x4 + d[41] * x5 + d[48] * x6;
380   }
381   PetscFunctionReturn(PETSC_SUCCESS);
382 }
383 
384 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
385 {
386   const MatScalar *v;
387   PetscScalar     *xp, x0, x1, x2, x3, x4, x5, x6;
388   PetscInt         nz, k;
389   const PetscInt  *vj;
390 
391   PetscFunctionBegin;
392   for (k = mbs - 1; k >= 0; k--) {
393     v  = aa + 49 * ai[k];
394     xp = x + k * 7;
395     x0 = xp[0];
396     x1 = xp[1];
397     x2 = xp[2];
398     x3 = xp[3];
399     x4 = xp[4];
400     x5 = xp[5];
401     x6 = xp[6]; /* xk */
402     nz = ai[k + 1] - ai[k];
403     vj = aj + ai[k];
404     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
405     PetscPrefetchBlock(v - 49 * nz, 49 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
406     while (nz--) {
407       xp = x + (*vj) * 7;
408       /* xk += U(k,:)*x(:) */
409       x0 += v[0] * xp[0] + v[7] * xp[1] + v[14] * xp[2] + v[21] * xp[3] + v[28] * xp[4] + v[35] * xp[5] + v[42] * xp[6];
410       x1 += v[1] * xp[0] + v[8] * xp[1] + v[15] * xp[2] + v[22] * xp[3] + v[29] * xp[4] + v[36] * xp[5] + v[43] * xp[6];
411       x2 += v[2] * xp[0] + v[9] * xp[1] + v[16] * xp[2] + v[23] * xp[3] + v[30] * xp[4] + v[37] * xp[5] + v[44] * xp[6];
412       x3 += v[3] * xp[0] + v[10] * xp[1] + v[17] * xp[2] + v[24] * xp[3] + v[31] * xp[4] + v[38] * xp[5] + v[45] * xp[6];
413       x4 += v[4] * xp[0] + v[11] * xp[1] + v[18] * xp[2] + v[25] * xp[3] + v[32] * xp[4] + v[39] * xp[5] + v[46] * xp[6];
414       x5 += v[5] * xp[0] + v[12] * xp[1] + v[19] * xp[2] + v[26] * xp[3] + v[33] * xp[4] + v[40] * xp[5] + v[47] * xp[6];
415       x6 += v[6] * xp[0] + v[13] * xp[1] + v[20] * xp[2] + v[27] * xp[3] + v[34] * xp[4] + v[41] * xp[5] + v[48] * xp[6];
416       vj++;
417       v += 49;
418     }
419     xp    = x + k * 7;
420     xp[0] = x0;
421     xp[1] = x1;
422     xp[2] = x2;
423     xp[3] = x3;
424     xp[4] = x4;
425     xp[5] = x5;
426     xp[6] = x6;
427   }
428   PetscFunctionReturn(PETSC_SUCCESS);
429 }
430 
431 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
432 {
433   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
434   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
435   const MatScalar   *aa = a->a;
436   PetscScalar       *x;
437   const PetscScalar *b;
438 
439   PetscFunctionBegin;
440   PetscCall(VecGetArrayRead(bb, &b));
441   PetscCall(VecGetArray(xx, &x));
442 
443   /* solve U^T * D * y = b by forward substitution */
444   PetscCall(PetscArraycpy(x, b, 7 * mbs)); /* x <- b */
445   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
446 
447   /* solve U*x = y by back substitution */
448   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
449 
450   PetscCall(VecRestoreArrayRead(bb, &b));
451   PetscCall(VecRestoreArray(xx, &x));
452   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
453   PetscFunctionReturn(PETSC_SUCCESS);
454 }
455 
456 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
457 {
458   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
459   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
460   const MatScalar   *aa = a->a;
461   PetscScalar       *x;
462   const PetscScalar *b;
463 
464   PetscFunctionBegin;
465   PetscCall(VecGetArrayRead(bb, &b));
466   PetscCall(VecGetArray(xx, &x));
467   PetscCall(PetscArraycpy(x, b, 7 * mbs));
468   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
469   PetscCall(VecRestoreArrayRead(bb, &b));
470   PetscCall(VecRestoreArray(xx, &x));
471   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
472   PetscFunctionReturn(PETSC_SUCCESS);
473 }
474 
475 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
476 {
477   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
478   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
479   const MatScalar   *aa = a->a;
480   PetscScalar       *x;
481   const PetscScalar *b;
482 
483   PetscFunctionBegin;
484   PetscCall(VecGetArrayRead(bb, &b));
485   PetscCall(VecGetArray(xx, &x));
486   PetscCall(PetscArraycpy(x, b, 7 * mbs));
487   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai, aj, aa, mbs, x));
488   PetscCall(VecRestoreArrayRead(bb, &b));
489   PetscCall(VecRestoreArray(xx, &x));
490   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
491   PetscFunctionReturn(PETSC_SUCCESS);
492 }
493 
494 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
495 {
496   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
497   IS                 isrow = a->row;
498   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *r, *vj;
499   PetscInt           nz, k, idx;
500   const MatScalar   *aa = a->a, *v, *d;
501   PetscScalar       *x, x0, x1, x2, x3, x4, x5, *t, *tp;
502   const PetscScalar *b;
503 
504   PetscFunctionBegin;
505   PetscCall(VecGetArrayRead(bb, &b));
506   PetscCall(VecGetArray(xx, &x));
507   t = a->solve_work;
508   PetscCall(ISGetIndices(isrow, &r));
509 
510   /* solve U^T * D * y = b by forward substitution */
511   tp = t;
512   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
513     idx   = 6 * r[k];
514     tp[0] = b[idx];
515     tp[1] = b[idx + 1];
516     tp[2] = b[idx + 2];
517     tp[3] = b[idx + 3];
518     tp[4] = b[idx + 4];
519     tp[5] = b[idx + 5];
520     tp += 6;
521   }
522 
523   for (k = 0; k < mbs; k++) {
524     v  = aa + 36 * ai[k];
525     vj = aj + ai[k];
526     tp = t + k * 6;
527     x0 = tp[0];
528     x1 = tp[1];
529     x2 = tp[2];
530     x3 = tp[3];
531     x4 = tp[4];
532     x5 = tp[5];
533     nz = ai[k + 1] - ai[k];
534     tp = t + (*vj) * 6;
535     while (nz--) {
536       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
537       tp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
538       tp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
539       tp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
540       tp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
541       tp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
542       vj++;
543       tp = t + (*vj) * 6;
544       v += 36;
545     }
546 
547     /* xk = inv(Dk)*(Dk*xk) */
548     d     = aa + k * 36; /* ptr to inv(Dk) */
549     tp    = t + k * 6;
550     tp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
551     tp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
552     tp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
553     tp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
554     tp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
555     tp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
556   }
557 
558   /* solve U*x = y by back substitution */
559   for (k = mbs - 1; k >= 0; k--) {
560     v  = aa + 36 * ai[k];
561     vj = aj + ai[k];
562     tp = t + k * 6;
563     x0 = tp[0];
564     x1 = tp[1];
565     x2 = tp[2];
566     x3 = tp[3];
567     x4 = tp[4];
568     x5 = tp[5]; /* xk */
569     nz = ai[k + 1] - ai[k];
570 
571     tp = t + (*vj) * 6;
572     while (nz--) {
573       /* xk += U(k,:)*x(:) */
574       x0 += v[0] * tp[0] + v[6] * tp[1] + v[12] * tp[2] + v[18] * tp[3] + v[24] * tp[4] + v[30] * tp[5];
575       x1 += v[1] * tp[0] + v[7] * tp[1] + v[13] * tp[2] + v[19] * tp[3] + v[25] * tp[4] + v[31] * tp[5];
576       x2 += v[2] * tp[0] + v[8] * tp[1] + v[14] * tp[2] + v[20] * tp[3] + v[26] * tp[4] + v[32] * tp[5];
577       x3 += v[3] * tp[0] + v[9] * tp[1] + v[15] * tp[2] + v[21] * tp[3] + v[27] * tp[4] + v[33] * tp[5];
578       x4 += v[4] * tp[0] + v[10] * tp[1] + v[16] * tp[2] + v[22] * tp[3] + v[28] * tp[4] + v[34] * tp[5];
579       x5 += v[5] * tp[0] + v[11] * tp[1] + v[17] * tp[2] + v[23] * tp[3] + v[29] * tp[4] + v[35] * tp[5];
580       vj++;
581       tp = t + (*vj) * 6;
582       v += 36;
583     }
584     tp         = t + k * 6;
585     tp[0]      = x0;
586     tp[1]      = x1;
587     tp[2]      = x2;
588     tp[3]      = x3;
589     tp[4]      = x4;
590     tp[5]      = x5;
591     idx        = 6 * r[k];
592     x[idx]     = x0;
593     x[idx + 1] = x1;
594     x[idx + 2] = x2;
595     x[idx + 3] = x3;
596     x[idx + 4] = x4;
597     x[idx + 5] = x5;
598   }
599 
600   PetscCall(ISRestoreIndices(isrow, &r));
601   PetscCall(VecRestoreArrayRead(bb, &b));
602   PetscCall(VecRestoreArray(xx, &x));
603   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
604   PetscFunctionReturn(PETSC_SUCCESS);
605 }
606 
607 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
608 {
609   const MatScalar *v, *d;
610   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
611   PetscInt         nz, k;
612   const PetscInt  *vj;
613 
614   PetscFunctionBegin;
615   for (k = 0; k < mbs; k++) {
616     v  = aa + 36 * ai[k];
617     xp = x + k * 6;
618     x0 = xp[0];
619     x1 = xp[1];
620     x2 = xp[2];
621     x3 = xp[3];
622     x4 = xp[4];
623     x5 = xp[5]; /* Dk*xk = k-th block of x */
624     nz = ai[k + 1] - ai[k];
625     vj = aj + ai[k];
626     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
627     PetscPrefetchBlock(v + 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
628     while (nz--) {
629       xp = x + (*vj) * 6;
630       /* x(:) += U(k,:)^T*(Dk*xk) */
631       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4 + v[5] * x5;
632       xp[1] += v[6] * x0 + v[7] * x1 + v[8] * x2 + v[9] * x3 + v[10] * x4 + v[11] * x5;
633       xp[2] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3 + v[16] * x4 + v[17] * x5;
634       xp[3] += v[18] * x0 + v[19] * x1 + v[20] * x2 + v[21] * x3 + v[22] * x4 + v[23] * x5;
635       xp[4] += v[24] * x0 + v[25] * x1 + v[26] * x2 + v[27] * x3 + v[28] * x4 + v[29] * x5;
636       xp[5] += v[30] * x0 + v[31] * x1 + v[32] * x2 + v[33] * x3 + v[34] * x4 + v[35] * x5;
637       vj++;
638       v += 36;
639     }
640     /* xk = inv(Dk)*(Dk*xk) */
641     d     = aa + k * 36; /* ptr to inv(Dk) */
642     xp    = x + k * 6;
643     xp[0] = d[0] * x0 + d[6] * x1 + d[12] * x2 + d[18] * x3 + d[24] * x4 + d[30] * x5;
644     xp[1] = d[1] * x0 + d[7] * x1 + d[13] * x2 + d[19] * x3 + d[25] * x4 + d[31] * x5;
645     xp[2] = d[2] * x0 + d[8] * x1 + d[14] * x2 + d[20] * x3 + d[26] * x4 + d[32] * x5;
646     xp[3] = d[3] * x0 + d[9] * x1 + d[15] * x2 + d[21] * x3 + d[27] * x4 + d[33] * x5;
647     xp[4] = d[4] * x0 + d[10] * x1 + d[16] * x2 + d[22] * x3 + d[28] * x4 + d[34] * x5;
648     xp[5] = d[5] * x0 + d[11] * x1 + d[17] * x2 + d[23] * x3 + d[29] * x4 + d[35] * x5;
649   }
650   PetscFunctionReturn(PETSC_SUCCESS);
651 }
652 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
653 {
654   const MatScalar *v;
655   PetscScalar     *xp, x0, x1, x2, x3, x4, x5;
656   PetscInt         nz, k;
657   const PetscInt  *vj;
658 
659   PetscFunctionBegin;
660   for (k = mbs - 1; k >= 0; k--) {
661     v  = aa + 36 * ai[k];
662     xp = x + k * 6;
663     x0 = xp[0];
664     x1 = xp[1];
665     x2 = xp[2];
666     x3 = xp[3];
667     x4 = xp[4];
668     x5 = xp[5]; /* xk */
669     nz = ai[k + 1] - ai[k];
670     vj = aj + ai[k];
671     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
672     PetscPrefetchBlock(v - 36 * nz, 36 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
673     while (nz--) {
674       xp = x + (*vj) * 6;
675       /* xk += U(k,:)*x(:) */
676       x0 += v[0] * xp[0] + v[6] * xp[1] + v[12] * xp[2] + v[18] * xp[3] + v[24] * xp[4] + v[30] * xp[5];
677       x1 += v[1] * xp[0] + v[7] * xp[1] + v[13] * xp[2] + v[19] * xp[3] + v[25] * xp[4] + v[31] * xp[5];
678       x2 += v[2] * xp[0] + v[8] * xp[1] + v[14] * xp[2] + v[20] * xp[3] + v[26] * xp[4] + v[32] * xp[5];
679       x3 += v[3] * xp[0] + v[9] * xp[1] + v[15] * xp[2] + v[21] * xp[3] + v[27] * xp[4] + v[33] * xp[5];
680       x4 += v[4] * xp[0] + v[10] * xp[1] + v[16] * xp[2] + v[22] * xp[3] + v[28] * xp[4] + v[34] * xp[5];
681       x5 += v[5] * xp[0] + v[11] * xp[1] + v[17] * xp[2] + v[23] * xp[3] + v[29] * xp[4] + v[35] * xp[5];
682       vj++;
683       v += 36;
684     }
685     xp    = x + k * 6;
686     xp[0] = x0;
687     xp[1] = x1;
688     xp[2] = x2;
689     xp[3] = x3;
690     xp[4] = x4;
691     xp[5] = x5;
692   }
693   PetscFunctionReturn(PETSC_SUCCESS);
694 }
695 
696 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
697 {
698   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
699   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
700   const MatScalar   *aa = a->a;
701   PetscScalar       *x;
702   const PetscScalar *b;
703 
704   PetscFunctionBegin;
705   PetscCall(VecGetArrayRead(bb, &b));
706   PetscCall(VecGetArray(xx, &x));
707 
708   /* solve U^T * D * y = b by forward substitution */
709   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
710   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
711 
712   /* solve U*x = y by back substitution */
713   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
714 
715   PetscCall(VecRestoreArrayRead(bb, &b));
716   PetscCall(VecRestoreArray(xx, &x));
717   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
718   PetscFunctionReturn(PETSC_SUCCESS);
719 }
720 
721 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
722 {
723   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
724   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
725   const MatScalar   *aa = a->a;
726   PetscScalar       *x;
727   const PetscScalar *b;
728 
729   PetscFunctionBegin;
730   PetscCall(VecGetArrayRead(bb, &b));
731   PetscCall(VecGetArray(xx, &x));
732   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
733   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
734   PetscCall(VecRestoreArrayRead(bb, &b));
735   PetscCall(VecRestoreArray(xx, &x));
736   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
737   PetscFunctionReturn(PETSC_SUCCESS);
738 }
739 
740 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
741 {
742   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
743   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
744   const MatScalar   *aa = a->a;
745   PetscScalar       *x;
746   const PetscScalar *b;
747 
748   PetscFunctionBegin;
749   PetscCall(VecGetArrayRead(bb, &b));
750   PetscCall(VecGetArray(xx, &x));
751   PetscCall(PetscArraycpy(x, b, 6 * mbs)); /* x <- b */
752   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai, aj, aa, mbs, x));
753   PetscCall(VecRestoreArrayRead(bb, &b));
754   PetscCall(VecRestoreArray(xx, &x));
755   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
756   PetscFunctionReturn(PETSC_SUCCESS);
757 }
758 
759 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
760 {
761   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
762   IS                 isrow = a->row;
763   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
764   const PetscInt    *r, *vj;
765   PetscInt           nz, k, idx;
766   const MatScalar   *aa = a->a, *v, *diag;
767   PetscScalar       *x, x0, x1, x2, x3, x4, *t, *tp;
768   const PetscScalar *b;
769 
770   PetscFunctionBegin;
771   PetscCall(VecGetArrayRead(bb, &b));
772   PetscCall(VecGetArray(xx, &x));
773   t = a->solve_work;
774   PetscCall(ISGetIndices(isrow, &r));
775 
776   /* solve U^T * D * y = b by forward substitution */
777   tp = t;
778   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
779     idx   = 5 * r[k];
780     tp[0] = b[idx];
781     tp[1] = b[idx + 1];
782     tp[2] = b[idx + 2];
783     tp[3] = b[idx + 3];
784     tp[4] = b[idx + 4];
785     tp += 5;
786   }
787 
788   for (k = 0; k < mbs; k++) {
789     v  = aa + 25 * ai[k];
790     vj = aj + ai[k];
791     tp = t + k * 5;
792     x0 = tp[0];
793     x1 = tp[1];
794     x2 = tp[2];
795     x3 = tp[3];
796     x4 = tp[4];
797     nz = ai[k + 1] - ai[k];
798 
799     tp = t + (*vj) * 5;
800     while (nz--) {
801       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
802       tp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
803       tp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
804       tp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
805       tp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
806       vj++;
807       tp = t + (*vj) * 5;
808       v += 25;
809     }
810 
811     /* xk = inv(Dk)*(Dk*xk) */
812     diag  = aa + k * 25; /* ptr to inv(Dk) */
813     tp    = t + k * 5;
814     tp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
815     tp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
816     tp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
817     tp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
818     tp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
819   }
820 
821   /* solve U*x = y by back substitution */
822   for (k = mbs - 1; k >= 0; k--) {
823     v  = aa + 25 * ai[k];
824     vj = aj + ai[k];
825     tp = t + k * 5;
826     x0 = tp[0];
827     x1 = tp[1];
828     x2 = tp[2];
829     x3 = tp[3];
830     x4 = tp[4]; /* xk */
831     nz = ai[k + 1] - ai[k];
832 
833     tp = t + (*vj) * 5;
834     while (nz--) {
835       /* xk += U(k,:)*x(:) */
836       x0 += v[0] * tp[0] + v[5] * tp[1] + v[10] * tp[2] + v[15] * tp[3] + v[20] * tp[4];
837       x1 += v[1] * tp[0] + v[6] * tp[1] + v[11] * tp[2] + v[16] * tp[3] + v[21] * tp[4];
838       x2 += v[2] * tp[0] + v[7] * tp[1] + v[12] * tp[2] + v[17] * tp[3] + v[22] * tp[4];
839       x3 += v[3] * tp[0] + v[8] * tp[1] + v[13] * tp[2] + v[18] * tp[3] + v[23] * tp[4];
840       x4 += v[4] * tp[0] + v[9] * tp[1] + v[14] * tp[2] + v[19] * tp[3] + v[24] * tp[4];
841       vj++;
842       tp = t + (*vj) * 5;
843       v += 25;
844     }
845     tp         = t + k * 5;
846     tp[0]      = x0;
847     tp[1]      = x1;
848     tp[2]      = x2;
849     tp[3]      = x3;
850     tp[4]      = x4;
851     idx        = 5 * r[k];
852     x[idx]     = x0;
853     x[idx + 1] = x1;
854     x[idx + 2] = x2;
855     x[idx + 3] = x3;
856     x[idx + 4] = x4;
857   }
858 
859   PetscCall(ISRestoreIndices(isrow, &r));
860   PetscCall(VecRestoreArrayRead(bb, &b));
861   PetscCall(VecRestoreArray(xx, &x));
862   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
863   PetscFunctionReturn(PETSC_SUCCESS);
864 }
865 
866 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
867 {
868   const MatScalar *v, *diag;
869   PetscScalar     *xp, x0, x1, x2, x3, x4;
870   PetscInt         nz, k;
871   const PetscInt  *vj;
872 
873   PetscFunctionBegin;
874   for (k = 0; k < mbs; k++) {
875     v  = aa + 25 * ai[k];
876     xp = x + k * 5;
877     x0 = xp[0];
878     x1 = xp[1];
879     x2 = xp[2];
880     x3 = xp[3];
881     x4 = xp[4]; /* Dk*xk = k-th block of x */
882     nz = ai[k + 1] - ai[k];
883     vj = aj + ai[k];
884     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
885     PetscPrefetchBlock(v + 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
886     while (nz--) {
887       xp = x + (*vj) * 5;
888       /* x(:) += U(k,:)^T*(Dk*xk) */
889       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3 + v[4] * x4;
890       xp[1] += v[5] * x0 + v[6] * x1 + v[7] * x2 + v[8] * x3 + v[9] * x4;
891       xp[2] += v[10] * x0 + v[11] * x1 + v[12] * x2 + v[13] * x3 + v[14] * x4;
892       xp[3] += v[15] * x0 + v[16] * x1 + v[17] * x2 + v[18] * x3 + v[19] * x4;
893       xp[4] += v[20] * x0 + v[21] * x1 + v[22] * x2 + v[23] * x3 + v[24] * x4;
894       vj++;
895       v += 25;
896     }
897     /* xk = inv(Dk)*(Dk*xk) */
898     diag  = aa + k * 25; /* ptr to inv(Dk) */
899     xp    = x + k * 5;
900     xp[0] = diag[0] * x0 + diag[5] * x1 + diag[10] * x2 + diag[15] * x3 + diag[20] * x4;
901     xp[1] = diag[1] * x0 + diag[6] * x1 + diag[11] * x2 + diag[16] * x3 + diag[21] * x4;
902     xp[2] = diag[2] * x0 + diag[7] * x1 + diag[12] * x2 + diag[17] * x3 + diag[22] * x4;
903     xp[3] = diag[3] * x0 + diag[8] * x1 + diag[13] * x2 + diag[18] * x3 + diag[23] * x4;
904     xp[4] = diag[4] * x0 + diag[9] * x1 + diag[14] * x2 + diag[19] * x3 + diag[24] * x4;
905   }
906   PetscFunctionReturn(PETSC_SUCCESS);
907 }
908 
909 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
910 {
911   const MatScalar *v;
912   PetscScalar     *xp, x0, x1, x2, x3, x4;
913   PetscInt         nz, k;
914   const PetscInt  *vj;
915 
916   PetscFunctionBegin;
917   for (k = mbs - 1; k >= 0; k--) {
918     v  = aa + 25 * ai[k];
919     xp = x + k * 5;
920     x0 = xp[0];
921     x1 = xp[1];
922     x2 = xp[2];
923     x3 = xp[3];
924     x4 = xp[4]; /* xk */
925     nz = ai[k + 1] - ai[k];
926     vj = aj + ai[k];
927     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
928     PetscPrefetchBlock(v - 25 * nz, 25 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
929     while (nz--) {
930       xp = x + (*vj) * 5;
931       /* xk += U(k,:)*x(:) */
932       x0 += v[0] * xp[0] + v[5] * xp[1] + v[10] * xp[2] + v[15] * xp[3] + v[20] * xp[4];
933       x1 += v[1] * xp[0] + v[6] * xp[1] + v[11] * xp[2] + v[16] * xp[3] + v[21] * xp[4];
934       x2 += v[2] * xp[0] + v[7] * xp[1] + v[12] * xp[2] + v[17] * xp[3] + v[22] * xp[4];
935       x3 += v[3] * xp[0] + v[8] * xp[1] + v[13] * xp[2] + v[18] * xp[3] + v[23] * xp[4];
936       x4 += v[4] * xp[0] + v[9] * xp[1] + v[14] * xp[2] + v[19] * xp[3] + v[24] * xp[4];
937       vj++;
938       v += 25;
939     }
940     xp    = x + k * 5;
941     xp[0] = x0;
942     xp[1] = x1;
943     xp[2] = x2;
944     xp[3] = x3;
945     xp[4] = x4;
946   }
947   PetscFunctionReturn(PETSC_SUCCESS);
948 }
949 
950 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
951 {
952   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
953   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
954   const MatScalar   *aa = a->a;
955   PetscScalar       *x;
956   const PetscScalar *b;
957 
958   PetscFunctionBegin;
959   PetscCall(VecGetArrayRead(bb, &b));
960   PetscCall(VecGetArray(xx, &x));
961 
962   /* solve U^T * D * y = b by forward substitution */
963   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
964   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
965 
966   /* solve U*x = y by back substitution */
967   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
968 
969   PetscCall(VecRestoreArrayRead(bb, &b));
970   PetscCall(VecRestoreArray(xx, &x));
971   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
972   PetscFunctionReturn(PETSC_SUCCESS);
973 }
974 
975 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
976 {
977   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
978   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
979   const MatScalar   *aa = a->a;
980   PetscScalar       *x;
981   const PetscScalar *b;
982 
983   PetscFunctionBegin;
984   PetscCall(VecGetArrayRead(bb, &b));
985   PetscCall(VecGetArray(xx, &x));
986   PetscCall(PetscArraycpy(x, b, 5 * mbs)); /* x <- b */
987   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
988   PetscCall(VecRestoreArrayRead(bb, &b));
989   PetscCall(VecRestoreArray(xx, &x));
990   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
991   PetscFunctionReturn(PETSC_SUCCESS);
992 }
993 
994 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
995 {
996   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
997   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
998   const MatScalar   *aa = a->a;
999   PetscScalar       *x;
1000   const PetscScalar *b;
1001 
1002   PetscFunctionBegin;
1003   PetscCall(VecGetArrayRead(bb, &b));
1004   PetscCall(VecGetArray(xx, &x));
1005   PetscCall(PetscArraycpy(x, b, 5 * mbs));
1006   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai, aj, aa, mbs, x));
1007   PetscCall(VecRestoreArrayRead(bb, &b));
1008   PetscCall(VecRestoreArray(xx, &x));
1009   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1010   PetscFunctionReturn(PETSC_SUCCESS);
1011 }
1012 
1013 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
1014 {
1015   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1016   IS                 isrow = a->row;
1017   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1018   const PetscInt    *r, *vj;
1019   PetscInt           nz, k, idx;
1020   const MatScalar   *aa = a->a, *v, *diag;
1021   PetscScalar       *x, x0, x1, x2, x3, *t, *tp;
1022   const PetscScalar *b;
1023 
1024   PetscFunctionBegin;
1025   PetscCall(VecGetArrayRead(bb, &b));
1026   PetscCall(VecGetArray(xx, &x));
1027   t = a->solve_work;
1028   PetscCall(ISGetIndices(isrow, &r));
1029 
1030   /* solve U^T * D * y = b by forward substitution */
1031   tp = t;
1032   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1033     idx   = 4 * r[k];
1034     tp[0] = b[idx];
1035     tp[1] = b[idx + 1];
1036     tp[2] = b[idx + 2];
1037     tp[3] = b[idx + 3];
1038     tp += 4;
1039   }
1040 
1041   for (k = 0; k < mbs; k++) {
1042     v  = aa + 16 * ai[k];
1043     vj = aj + ai[k];
1044     tp = t + k * 4;
1045     x0 = tp[0];
1046     x1 = tp[1];
1047     x2 = tp[2];
1048     x3 = tp[3];
1049     nz = ai[k + 1] - ai[k];
1050 
1051     tp = t + (*vj) * 4;
1052     while (nz--) {
1053       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1054       tp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1055       tp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1056       tp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1057       vj++;
1058       tp = t + (*vj) * 4;
1059       v += 16;
1060     }
1061 
1062     /* xk = inv(Dk)*(Dk*xk) */
1063     diag  = aa + k * 16; /* ptr to inv(Dk) */
1064     tp    = t + k * 4;
1065     tp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1066     tp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1067     tp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1068     tp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1069   }
1070 
1071   /* solve U*x = y by back substitution */
1072   for (k = mbs - 1; k >= 0; k--) {
1073     v  = aa + 16 * ai[k];
1074     vj = aj + ai[k];
1075     tp = t + k * 4;
1076     x0 = tp[0];
1077     x1 = tp[1];
1078     x2 = tp[2];
1079     x3 = tp[3]; /* xk */
1080     nz = ai[k + 1] - ai[k];
1081 
1082     tp = t + (*vj) * 4;
1083     while (nz--) {
1084       /* xk += U(k,:)*x(:) */
1085       x0 += v[0] * tp[0] + v[4] * tp[1] + v[8] * tp[2] + v[12] * tp[3];
1086       x1 += v[1] * tp[0] + v[5] * tp[1] + v[9] * tp[2] + v[13] * tp[3];
1087       x2 += v[2] * tp[0] + v[6] * tp[1] + v[10] * tp[2] + v[14] * tp[3];
1088       x3 += v[3] * tp[0] + v[7] * tp[1] + v[11] * tp[2] + v[15] * tp[3];
1089       vj++;
1090       tp = t + (*vj) * 4;
1091       v += 16;
1092     }
1093     tp         = t + k * 4;
1094     tp[0]      = x0;
1095     tp[1]      = x1;
1096     tp[2]      = x2;
1097     tp[3]      = x3;
1098     idx        = 4 * r[k];
1099     x[idx]     = x0;
1100     x[idx + 1] = x1;
1101     x[idx + 2] = x2;
1102     x[idx + 3] = x3;
1103   }
1104 
1105   PetscCall(ISRestoreIndices(isrow, &r));
1106   PetscCall(VecRestoreArrayRead(bb, &b));
1107   PetscCall(VecRestoreArray(xx, &x));
1108   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1109   PetscFunctionReturn(PETSC_SUCCESS);
1110 }
1111 
1112 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1113 {
1114   const MatScalar *v, *diag;
1115   PetscScalar     *xp, x0, x1, x2, x3;
1116   PetscInt         nz, k;
1117   const PetscInt  *vj;
1118 
1119   PetscFunctionBegin;
1120   for (k = 0; k < mbs; k++) {
1121     v  = aa + 16 * ai[k];
1122     xp = x + k * 4;
1123     x0 = xp[0];
1124     x1 = xp[1];
1125     x2 = xp[2];
1126     x3 = xp[3]; /* Dk*xk = k-th block of x */
1127     nz = ai[k + 1] - ai[k];
1128     vj = aj + ai[k];
1129     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1130     PetscPrefetchBlock(v + 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1131     while (nz--) {
1132       xp = x + (*vj) * 4;
1133       /* x(:) += U(k,:)^T*(Dk*xk) */
1134       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2 + v[3] * x3;
1135       xp[1] += v[4] * x0 + v[5] * x1 + v[6] * x2 + v[7] * x3;
1136       xp[2] += v[8] * x0 + v[9] * x1 + v[10] * x2 + v[11] * x3;
1137       xp[3] += v[12] * x0 + v[13] * x1 + v[14] * x2 + v[15] * x3;
1138       vj++;
1139       v += 16;
1140     }
1141     /* xk = inv(Dk)*(Dk*xk) */
1142     diag  = aa + k * 16; /* ptr to inv(Dk) */
1143     xp    = x + k * 4;
1144     xp[0] = diag[0] * x0 + diag[4] * x1 + diag[8] * x2 + diag[12] * x3;
1145     xp[1] = diag[1] * x0 + diag[5] * x1 + diag[9] * x2 + diag[13] * x3;
1146     xp[2] = diag[2] * x0 + diag[6] * x1 + diag[10] * x2 + diag[14] * x3;
1147     xp[3] = diag[3] * x0 + diag[7] * x1 + diag[11] * x2 + diag[15] * x3;
1148   }
1149   PetscFunctionReturn(PETSC_SUCCESS);
1150 }
1151 
1152 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1153 {
1154   const MatScalar *v;
1155   PetscScalar     *xp, x0, x1, x2, x3;
1156   PetscInt         nz, k;
1157   const PetscInt  *vj;
1158 
1159   PetscFunctionBegin;
1160   for (k = mbs - 1; k >= 0; k--) {
1161     v  = aa + 16 * ai[k];
1162     xp = x + k * 4;
1163     x0 = xp[0];
1164     x1 = xp[1];
1165     x2 = xp[2];
1166     x3 = xp[3]; /* xk */
1167     nz = ai[k + 1] - ai[k];
1168     vj = aj + ai[k];
1169     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);          /* Indices for the next row (assumes same size as this one) */
1170     PetscPrefetchBlock(v - 16 * nz, 16 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1171     while (nz--) {
1172       xp = x + (*vj) * 4;
1173       /* xk += U(k,:)*x(:) */
1174       x0 += v[0] * xp[0] + v[4] * xp[1] + v[8] * xp[2] + v[12] * xp[3];
1175       x1 += v[1] * xp[0] + v[5] * xp[1] + v[9] * xp[2] + v[13] * xp[3];
1176       x2 += v[2] * xp[0] + v[6] * xp[1] + v[10] * xp[2] + v[14] * xp[3];
1177       x3 += v[3] * xp[0] + v[7] * xp[1] + v[11] * xp[2] + v[15] * xp[3];
1178       vj++;
1179       v += 16;
1180     }
1181     xp    = x + k * 4;
1182     xp[0] = x0;
1183     xp[1] = x1;
1184     xp[2] = x2;
1185     xp[3] = x3;
1186   }
1187   PetscFunctionReturn(PETSC_SUCCESS);
1188 }
1189 
1190 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1191 {
1192   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1193   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1194   const MatScalar   *aa = a->a;
1195   PetscScalar       *x;
1196   const PetscScalar *b;
1197 
1198   PetscFunctionBegin;
1199   PetscCall(VecGetArrayRead(bb, &b));
1200   PetscCall(VecGetArray(xx, &x));
1201 
1202   /* solve U^T * D * y = b by forward substitution */
1203   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1204   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1205 
1206   /* solve U*x = y by back substitution */
1207   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1208   PetscCall(VecRestoreArrayRead(bb, &b));
1209   PetscCall(VecRestoreArray(xx, &x));
1210   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1211   PetscFunctionReturn(PETSC_SUCCESS);
1212 }
1213 
1214 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1215 {
1216   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1217   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1218   const MatScalar   *aa = a->a;
1219   PetscScalar       *x;
1220   const PetscScalar *b;
1221 
1222   PetscFunctionBegin;
1223   PetscCall(VecGetArrayRead(bb, &b));
1224   PetscCall(VecGetArray(xx, &x));
1225   PetscCall(PetscArraycpy(x, b, 4 * mbs)); /* x <- b */
1226   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1227   PetscCall(VecRestoreArrayRead(bb, &b));
1228   PetscCall(VecRestoreArray(xx, &x));
1229   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1230   PetscFunctionReturn(PETSC_SUCCESS);
1231 }
1232 
1233 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1234 {
1235   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1236   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1237   const MatScalar   *aa = a->a;
1238   PetscScalar       *x;
1239   const PetscScalar *b;
1240 
1241   PetscFunctionBegin;
1242   PetscCall(VecGetArrayRead(bb, &b));
1243   PetscCall(VecGetArray(xx, &x));
1244   PetscCall(PetscArraycpy(x, b, 4 * mbs));
1245   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai, aj, aa, mbs, x));
1246   PetscCall(VecRestoreArrayRead(bb, &b));
1247   PetscCall(VecRestoreArray(xx, &x));
1248   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1249   PetscFunctionReturn(PETSC_SUCCESS);
1250 }
1251 
1252 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1253 {
1254   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1255   IS                 isrow = a->row;
1256   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1257   const PetscInt    *r;
1258   PetscInt           nz, k, idx;
1259   const PetscInt    *vj;
1260   const MatScalar   *aa = a->a, *v, *diag;
1261   PetscScalar       *x, x0, x1, x2, *t, *tp;
1262   const PetscScalar *b;
1263 
1264   PetscFunctionBegin;
1265   PetscCall(VecGetArrayRead(bb, &b));
1266   PetscCall(VecGetArray(xx, &x));
1267   t = a->solve_work;
1268   PetscCall(ISGetIndices(isrow, &r));
1269 
1270   /* solve U^T * D * y = b by forward substitution */
1271   tp = t;
1272   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1273     idx   = 3 * r[k];
1274     tp[0] = b[idx];
1275     tp[1] = b[idx + 1];
1276     tp[2] = b[idx + 2];
1277     tp += 3;
1278   }
1279 
1280   for (k = 0; k < mbs; k++) {
1281     v  = aa + 9 * ai[k];
1282     vj = aj + ai[k];
1283     tp = t + k * 3;
1284     x0 = tp[0];
1285     x1 = tp[1];
1286     x2 = tp[2];
1287     nz = ai[k + 1] - ai[k];
1288 
1289     tp = t + (*vj) * 3;
1290     while (nz--) {
1291       tp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1292       tp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1293       tp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1294       vj++;
1295       tp = t + (*vj) * 3;
1296       v += 9;
1297     }
1298 
1299     /* xk = inv(Dk)*(Dk*xk) */
1300     diag  = aa + k * 9; /* ptr to inv(Dk) */
1301     tp    = t + k * 3;
1302     tp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1303     tp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1304     tp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1305   }
1306 
1307   /* solve U*x = y by back substitution */
1308   for (k = mbs - 1; k >= 0; k--) {
1309     v  = aa + 9 * ai[k];
1310     vj = aj + ai[k];
1311     tp = t + k * 3;
1312     x0 = tp[0];
1313     x1 = tp[1];
1314     x2 = tp[2]; /* xk */
1315     nz = ai[k + 1] - ai[k];
1316 
1317     tp = t + (*vj) * 3;
1318     while (nz--) {
1319       /* xk += U(k,:)*x(:) */
1320       x0 += v[0] * tp[0] + v[3] * tp[1] + v[6] * tp[2];
1321       x1 += v[1] * tp[0] + v[4] * tp[1] + v[7] * tp[2];
1322       x2 += v[2] * tp[0] + v[5] * tp[1] + v[8] * tp[2];
1323       vj++;
1324       tp = t + (*vj) * 3;
1325       v += 9;
1326     }
1327     tp         = t + k * 3;
1328     tp[0]      = x0;
1329     tp[1]      = x1;
1330     tp[2]      = x2;
1331     idx        = 3 * r[k];
1332     x[idx]     = x0;
1333     x[idx + 1] = x1;
1334     x[idx + 2] = x2;
1335   }
1336 
1337   PetscCall(ISRestoreIndices(isrow, &r));
1338   PetscCall(VecRestoreArrayRead(bb, &b));
1339   PetscCall(VecRestoreArray(xx, &x));
1340   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1341   PetscFunctionReturn(PETSC_SUCCESS);
1342 }
1343 
1344 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1345 {
1346   const MatScalar *v, *diag;
1347   PetscScalar     *xp, x0, x1, x2;
1348   PetscInt         nz, k;
1349   const PetscInt  *vj;
1350 
1351   PetscFunctionBegin;
1352   for (k = 0; k < mbs; k++) {
1353     v  = aa + 9 * ai[k];
1354     xp = x + k * 3;
1355     x0 = xp[0];
1356     x1 = xp[1];
1357     x2 = xp[2]; /* Dk*xk = k-th block of x */
1358     nz = ai[k + 1] - ai[k];
1359     vj = aj + ai[k];
1360     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1361     PetscPrefetchBlock(v + 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1362     while (nz--) {
1363       xp = x + (*vj) * 3;
1364       /* x(:) += U(k,:)^T*(Dk*xk) */
1365       xp[0] += v[0] * x0 + v[1] * x1 + v[2] * x2;
1366       xp[1] += v[3] * x0 + v[4] * x1 + v[5] * x2;
1367       xp[2] += v[6] * x0 + v[7] * x1 + v[8] * x2;
1368       vj++;
1369       v += 9;
1370     }
1371     /* xk = inv(Dk)*(Dk*xk) */
1372     diag  = aa + k * 9; /* ptr to inv(Dk) */
1373     xp    = x + k * 3;
1374     xp[0] = diag[0] * x0 + diag[3] * x1 + diag[6] * x2;
1375     xp[1] = diag[1] * x0 + diag[4] * x1 + diag[7] * x2;
1376     xp[2] = diag[2] * x0 + diag[5] * x1 + diag[8] * x2;
1377   }
1378   PetscFunctionReturn(PETSC_SUCCESS);
1379 }
1380 
1381 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1382 {
1383   const MatScalar *v;
1384   PetscScalar     *xp, x0, x1, x2;
1385   PetscInt         nz, k;
1386   const PetscInt  *vj;
1387 
1388   PetscFunctionBegin;
1389   for (k = mbs - 1; k >= 0; k--) {
1390     v  = aa + 9 * ai[k];
1391     xp = x + k * 3;
1392     x0 = xp[0];
1393     x1 = xp[1];
1394     x2 = xp[2]; /* xk */
1395     nz = ai[k + 1] - ai[k];
1396     vj = aj + ai[k];
1397     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1398     PetscPrefetchBlock(v - 9 * nz, 9 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1399     while (nz--) {
1400       xp = x + (*vj) * 3;
1401       /* xk += U(k,:)*x(:) */
1402       x0 += v[0] * xp[0] + v[3] * xp[1] + v[6] * xp[2];
1403       x1 += v[1] * xp[0] + v[4] * xp[1] + v[7] * xp[2];
1404       x2 += v[2] * xp[0] + v[5] * xp[1] + v[8] * xp[2];
1405       vj++;
1406       v += 9;
1407     }
1408     xp    = x + k * 3;
1409     xp[0] = x0;
1410     xp[1] = x1;
1411     xp[2] = x2;
1412   }
1413   PetscFunctionReturn(PETSC_SUCCESS);
1414 }
1415 
1416 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1417 {
1418   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1419   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1420   const MatScalar   *aa = a->a;
1421   PetscScalar       *x;
1422   const PetscScalar *b;
1423 
1424   PetscFunctionBegin;
1425   PetscCall(VecGetArrayRead(bb, &b));
1426   PetscCall(VecGetArray(xx, &x));
1427 
1428   /* solve U^T * D * y = b by forward substitution */
1429   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1430   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1431 
1432   /* solve U*x = y by back substitution */
1433   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1434 
1435   PetscCall(VecRestoreArrayRead(bb, &b));
1436   PetscCall(VecRestoreArray(xx, &x));
1437   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1438   PetscFunctionReturn(PETSC_SUCCESS);
1439 }
1440 
1441 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1442 {
1443   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1444   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1445   const MatScalar   *aa = a->a;
1446   PetscScalar       *x;
1447   const PetscScalar *b;
1448 
1449   PetscFunctionBegin;
1450   PetscCall(VecGetArrayRead(bb, &b));
1451   PetscCall(VecGetArray(xx, &x));
1452   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1453   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1454   PetscCall(VecRestoreArrayRead(bb, &b));
1455   PetscCall(VecRestoreArray(xx, &x));
1456   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1457   PetscFunctionReturn(PETSC_SUCCESS);
1458 }
1459 
1460 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1461 {
1462   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1463   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1464   const MatScalar   *aa = a->a;
1465   PetscScalar       *x;
1466   const PetscScalar *b;
1467 
1468   PetscFunctionBegin;
1469   PetscCall(VecGetArrayRead(bb, &b));
1470   PetscCall(VecGetArray(xx, &x));
1471   PetscCall(PetscArraycpy(x, b, 3 * mbs));
1472   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai, aj, aa, mbs, x));
1473   PetscCall(VecRestoreArrayRead(bb, &b));
1474   PetscCall(VecRestoreArray(xx, &x));
1475   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1476   PetscFunctionReturn(PETSC_SUCCESS);
1477 }
1478 
1479 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1480 {
1481   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1482   IS                 isrow = a->row;
1483   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1484   const PetscInt    *r, *vj;
1485   PetscInt           nz, k, k2, idx;
1486   const MatScalar   *aa = a->a, *v, *diag;
1487   PetscScalar       *x, x0, x1, *t;
1488   const PetscScalar *b;
1489 
1490   PetscFunctionBegin;
1491   PetscCall(VecGetArrayRead(bb, &b));
1492   PetscCall(VecGetArray(xx, &x));
1493   t = a->solve_work;
1494   PetscCall(ISGetIndices(isrow, &r));
1495 
1496   /* solve U^T * D * y = perm(b) by forward substitution */
1497   for (k = 0; k < mbs; k++) { /* t <- perm(b) */
1498     idx          = 2 * r[k];
1499     t[k * 2]     = b[idx];
1500     t[k * 2 + 1] = b[idx + 1];
1501   }
1502   for (k = 0; k < mbs; k++) {
1503     v  = aa + 4 * ai[k];
1504     vj = aj + ai[k];
1505     k2 = k * 2;
1506     x0 = t[k2];
1507     x1 = t[k2 + 1];
1508     nz = ai[k + 1] - ai[k];
1509     while (nz--) {
1510       t[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1511       t[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1512       vj++;
1513       v += 4;
1514     }
1515     diag      = aa + k * 4; /* ptr to inv(Dk) */
1516     t[k2]     = diag[0] * x0 + diag[2] * x1;
1517     t[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1518   }
1519 
1520   /* solve U*x = y by back substitution */
1521   for (k = mbs - 1; k >= 0; k--) {
1522     v  = aa + 4 * ai[k];
1523     vj = aj + ai[k];
1524     k2 = k * 2;
1525     x0 = t[k2];
1526     x1 = t[k2 + 1];
1527     nz = ai[k + 1] - ai[k];
1528     while (nz--) {
1529       x0 += v[0] * t[(*vj) * 2] + v[2] * t[(*vj) * 2 + 1];
1530       x1 += v[1] * t[(*vj) * 2] + v[3] * t[(*vj) * 2 + 1];
1531       vj++;
1532       v += 4;
1533     }
1534     t[k2]      = x0;
1535     t[k2 + 1]  = x1;
1536     idx        = 2 * r[k];
1537     x[idx]     = x0;
1538     x[idx + 1] = x1;
1539   }
1540 
1541   PetscCall(ISRestoreIndices(isrow, &r));
1542   PetscCall(VecRestoreArrayRead(bb, &b));
1543   PetscCall(VecRestoreArray(xx, &x));
1544   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1545   PetscFunctionReturn(PETSC_SUCCESS);
1546 }
1547 
1548 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1549 {
1550   const MatScalar *v, *diag;
1551   PetscScalar      x0, x1;
1552   PetscInt         nz, k, k2;
1553   const PetscInt  *vj;
1554 
1555   PetscFunctionBegin;
1556   for (k = 0; k < mbs; k++) {
1557     v  = aa + 4 * ai[k];
1558     vj = aj + ai[k];
1559     k2 = k * 2;
1560     x0 = x[k2];
1561     x1 = x[k2 + 1]; /* Dk*xk = k-th block of x */
1562     nz = ai[k + 1] - ai[k];
1563     PetscPrefetchBlock(vj + nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1564     PetscPrefetchBlock(v + 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1565     while (nz--) {
1566       /* x(:) += U(k,:)^T*(Dk*xk) */
1567       x[(*vj) * 2] += v[0] * x0 + v[1] * x1;
1568       x[(*vj) * 2 + 1] += v[2] * x0 + v[3] * x1;
1569       vj++;
1570       v += 4;
1571     }
1572     /* xk = inv(Dk)*(Dk*xk) */
1573     diag      = aa + k * 4; /* ptr to inv(Dk) */
1574     x[k2]     = diag[0] * x0 + diag[2] * x1;
1575     x[k2 + 1] = diag[1] * x0 + diag[3] * x1;
1576   }
1577   PetscFunctionReturn(PETSC_SUCCESS);
1578 }
1579 
1580 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai, const PetscInt *aj, const MatScalar *aa, PetscInt mbs, PetscScalar *x)
1581 {
1582   const MatScalar *v;
1583   PetscScalar      x0, x1;
1584   PetscInt         nz, k, k2;
1585   const PetscInt  *vj;
1586 
1587   PetscFunctionBegin;
1588   for (k = mbs - 1; k >= 0; k--) {
1589     v  = aa + 4 * ai[k];
1590     vj = aj + ai[k];
1591     k2 = k * 2;
1592     x0 = x[k2];
1593     x1 = x[k2 + 1]; /* xk */
1594     nz = ai[k + 1] - ai[k];
1595     PetscPrefetchBlock(vj - nz, nz, 0, PETSC_PREFETCH_HINT_NTA);        /* Indices for the next row (assumes same size as this one) */
1596     PetscPrefetchBlock(v - 4 * nz, 4 * nz, 0, PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1597     while (nz--) {
1598       /* xk += U(k,:)*x(:) */
1599       x0 += v[0] * x[(*vj) * 2] + v[2] * x[(*vj) * 2 + 1];
1600       x1 += v[1] * x[(*vj) * 2] + v[3] * x[(*vj) * 2 + 1];
1601       vj++;
1602       v += 4;
1603     }
1604     x[k2]     = x0;
1605     x[k2 + 1] = x1;
1606   }
1607   PetscFunctionReturn(PETSC_SUCCESS);
1608 }
1609 
1610 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1611 {
1612   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1613   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1614   const MatScalar   *aa = a->a;
1615   PetscScalar       *x;
1616   const PetscScalar *b;
1617 
1618   PetscFunctionBegin;
1619   PetscCall(VecGetArrayRead(bb, &b));
1620   PetscCall(VecGetArray(xx, &x));
1621 
1622   /* solve U^T * D * y = b by forward substitution */
1623   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1624   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1625 
1626   /* solve U*x = y by back substitution */
1627   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1628 
1629   PetscCall(VecRestoreArrayRead(bb, &b));
1630   PetscCall(VecRestoreArray(xx, &x));
1631   PetscCall(PetscLogFlops(4.0 * a->bs2 * a->nz - (A->rmap->bs + 2.0 * a->bs2) * mbs));
1632   PetscFunctionReturn(PETSC_SUCCESS);
1633 }
1634 
1635 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1636 {
1637   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1638   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1639   const MatScalar   *aa = a->a;
1640   PetscScalar       *x;
1641   const PetscScalar *b;
1642 
1643   PetscFunctionBegin;
1644   PetscCall(VecGetArrayRead(bb, &b));
1645   PetscCall(VecGetArray(xx, &x));
1646   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1647   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1648   PetscCall(VecRestoreArrayRead(bb, &b));
1649   PetscCall(VecRestoreArray(xx, &x));
1650   PetscCall(PetscLogFlops(2.0 * a->bs2 * a->nz - A->rmap->bs * mbs));
1651   PetscFunctionReturn(PETSC_SUCCESS);
1652 }
1653 
1654 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
1655 {
1656   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
1657   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j;
1658   const MatScalar   *aa = a->a;
1659   PetscScalar       *x;
1660   const PetscScalar *b;
1661 
1662   PetscFunctionBegin;
1663   PetscCall(VecGetArrayRead(bb, &b));
1664   PetscCall(VecGetArray(xx, &x));
1665   PetscCall(PetscArraycpy(x, b, 2 * mbs));
1666   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai, aj, aa, mbs, x));
1667   PetscCall(VecRestoreArrayRead(bb, &b));
1668   PetscCall(VecRestoreArray(xx, &x));
1669   PetscCall(PetscLogFlops(2.0 * a->bs2 * (a->nz - mbs)));
1670   PetscFunctionReturn(PETSC_SUCCESS);
1671 }
1672 
1673 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1674 {
1675   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1676   IS                 isrow = a->row;
1677   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1678   const MatScalar   *aa = a->a, *v;
1679   const PetscScalar *b;
1680   PetscScalar       *x, xk, *t;
1681   PetscInt           nz, k, j;
1682 
1683   PetscFunctionBegin;
1684   PetscCall(VecGetArrayRead(bb, &b));
1685   PetscCall(VecGetArray(xx, &x));
1686   t = a->solve_work;
1687   PetscCall(ISGetIndices(isrow, &rp));
1688 
1689   /* solve U^T*D*y = perm(b) by forward substitution */
1690   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1691   for (k = 0; k < mbs; k++) {
1692     v  = aa + ai[k];
1693     vj = aj + ai[k];
1694     xk = t[k];
1695     nz = ai[k + 1] - ai[k] - 1;
1696     for (j = 0; j < nz; j++) t[vj[j]] += v[j] * xk;
1697     t[k] = xk * v[nz]; /* v[nz] = 1/D(k) */
1698   }
1699 
1700   /* solve U*perm(x) = y by back substitution */
1701   for (k = mbs - 1; k >= 0; k--) {
1702     v  = aa + adiag[k] - 1;
1703     vj = aj + adiag[k] - 1;
1704     nz = ai[k + 1] - ai[k] - 1;
1705     for (j = 0; j < nz; j++) t[k] += v[-j] * t[vj[-j]];
1706     x[rp[k]] = t[k];
1707   }
1708 
1709   PetscCall(ISRestoreIndices(isrow, &rp));
1710   PetscCall(VecRestoreArrayRead(bb, &b));
1711   PetscCall(VecRestoreArray(xx, &x));
1712   PetscCall(PetscLogFlops(4.0 * a->nz - 3.0 * mbs));
1713   PetscFunctionReturn(PETSC_SUCCESS);
1714 }
1715 
1716 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1717 {
1718   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1719   IS                 isrow = a->row;
1720   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1721   const MatScalar   *aa = a->a, *v;
1722   PetscScalar       *x, xk, *t;
1723   const PetscScalar *b;
1724   PetscInt           nz, k;
1725 
1726   PetscFunctionBegin;
1727   PetscCall(VecGetArrayRead(bb, &b));
1728   PetscCall(VecGetArray(xx, &x));
1729   t = a->solve_work;
1730   PetscCall(ISGetIndices(isrow, &rp));
1731 
1732   /* solve U^T*D*y = perm(b) by forward substitution */
1733   for (k = 0; k < mbs; k++) t[k] = b[rp[k]];
1734   for (k = 0; k < mbs; k++) {
1735     v  = aa + ai[k] + 1;
1736     vj = aj + ai[k] + 1;
1737     xk = t[k];
1738     nz = ai[k + 1] - ai[k] - 1;
1739     while (nz--) t[*vj++] += (*v++) * xk;
1740     t[k] = xk * aa[ai[k]]; /* aa[k] = 1/D(k) */
1741   }
1742 
1743   /* solve U*perm(x) = y by back substitution */
1744   for (k = mbs - 1; k >= 0; k--) {
1745     v  = aa + ai[k] + 1;
1746     vj = aj + ai[k] + 1;
1747     nz = ai[k + 1] - ai[k] - 1;
1748     while (nz--) t[k] += (*v++) * t[*vj++];
1749     x[rp[k]] = t[k];
1750   }
1751 
1752   PetscCall(ISRestoreIndices(isrow, &rp));
1753   PetscCall(VecRestoreArrayRead(bb, &b));
1754   PetscCall(VecRestoreArray(xx, &x));
1755   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
1756   PetscFunctionReturn(PETSC_SUCCESS);
1757 }
1758 
1759 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1760 {
1761   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1762   IS                 isrow = a->row;
1763   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1764   const MatScalar   *aa = a->a, *v;
1765   PetscReal          diagk;
1766   PetscScalar       *x, xk;
1767   const PetscScalar *b;
1768   PetscInt           nz, k;
1769 
1770   PetscFunctionBegin;
1771   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1772   PetscCall(VecGetArrayRead(bb, &b));
1773   PetscCall(VecGetArray(xx, &x));
1774   PetscCall(ISGetIndices(isrow, &rp));
1775 
1776   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1777   for (k = 0; k < mbs; k++) {
1778     v  = aa + ai[k];
1779     vj = aj + ai[k];
1780     xk = x[k];
1781     nz = ai[k + 1] - ai[k] - 1;
1782     while (nz--) x[*vj++] += (*v++) * xk;
1783 
1784     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1785     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1786     x[k] = xk * PetscSqrtReal(diagk);
1787   }
1788   PetscCall(ISRestoreIndices(isrow, &rp));
1789   PetscCall(VecRestoreArrayRead(bb, &b));
1790   PetscCall(VecRestoreArray(xx, &x));
1791   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1792   PetscFunctionReturn(PETSC_SUCCESS);
1793 }
1794 
1795 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1796 {
1797   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1798   IS                 isrow = a->row;
1799   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1800   const MatScalar   *aa = a->a, *v;
1801   PetscReal          diagk;
1802   PetscScalar       *x, xk;
1803   const PetscScalar *b;
1804   PetscInt           nz, k;
1805 
1806   PetscFunctionBegin;
1807   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1808   PetscCall(VecGetArrayRead(bb, &b));
1809   PetscCall(VecGetArray(xx, &x));
1810   PetscCall(ISGetIndices(isrow, &rp));
1811 
1812   for (k = 0; k < mbs; k++) x[k] = b[rp[k]];
1813   for (k = 0; k < mbs; k++) {
1814     v  = aa + ai[k] + 1;
1815     vj = aj + ai[k] + 1;
1816     xk = x[k];
1817     nz = ai[k + 1] - ai[k] - 1;
1818     while (nz--) x[*vj++] += (*v++) * xk;
1819 
1820     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1821     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1822     x[k] = xk * PetscSqrtReal(diagk);
1823   }
1824   PetscCall(ISRestoreIndices(isrow, &rp));
1825   PetscCall(VecRestoreArrayRead(bb, &b));
1826   PetscCall(VecRestoreArray(xx, &x));
1827   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1828   PetscFunctionReturn(PETSC_SUCCESS);
1829 }
1830 
1831 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A, Vec bb, Vec xx)
1832 {
1833   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1834   IS                 isrow = a->row;
1835   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj, *adiag = a->diag;
1836   const MatScalar   *aa = a->a, *v;
1837   PetscReal          diagk;
1838   PetscScalar       *x, *t;
1839   const PetscScalar *b;
1840   PetscInt           nz, k;
1841 
1842   PetscFunctionBegin;
1843   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1844   PetscCall(VecGetArrayRead(bb, &b));
1845   PetscCall(VecGetArray(xx, &x));
1846   t = a->solve_work;
1847   PetscCall(ISGetIndices(isrow, &rp));
1848 
1849   for (k = mbs - 1; k >= 0; k--) {
1850     v     = aa + ai[k];
1851     vj    = aj + ai[k];
1852     diagk = PetscRealPart(aa[adiag[k]]);
1853     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1854     t[k] = b[k] * PetscSqrtReal(diagk);
1855     nz   = ai[k + 1] - ai[k] - 1;
1856     while (nz--) t[k] += (*v++) * t[*vj++];
1857     x[rp[k]] = t[k];
1858   }
1859   PetscCall(ISRestoreIndices(isrow, &rp));
1860   PetscCall(VecRestoreArrayRead(bb, &b));
1861   PetscCall(VecRestoreArray(xx, &x));
1862   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1863   PetscFunctionReturn(PETSC_SUCCESS);
1864 }
1865 
1866 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1867 {
1868   Mat_SeqSBAIJ      *a     = (Mat_SeqSBAIJ *)A->data;
1869   IS                 isrow = a->row;
1870   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *rp, *vj;
1871   const MatScalar   *aa = a->a, *v;
1872   PetscReal          diagk;
1873   PetscScalar       *x, *t;
1874   const PetscScalar *b;
1875   PetscInt           nz, k;
1876 
1877   PetscFunctionBegin;
1878   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1879   PetscCall(VecGetArrayRead(bb, &b));
1880   PetscCall(VecGetArray(xx, &x));
1881   t = a->solve_work;
1882   PetscCall(ISGetIndices(isrow, &rp));
1883 
1884   for (k = mbs - 1; k >= 0; k--) {
1885     v     = aa + ai[k] + 1;
1886     vj    = aj + ai[k] + 1;
1887     diagk = PetscRealPart(aa[ai[k]]);
1888     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
1889     t[k] = b[k] * PetscSqrtReal(diagk);
1890     nz   = ai[k + 1] - ai[k] - 1;
1891     while (nz--) t[k] += (*v++) * t[*vj++];
1892     x[rp[k]] = t[k];
1893   }
1894   PetscCall(ISRestoreIndices(isrow, &rp));
1895   PetscCall(VecRestoreArrayRead(bb, &b));
1896   PetscCall(VecRestoreArray(xx, &x));
1897   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
1898   PetscFunctionReturn(PETSC_SUCCESS);
1899 }
1900 
1901 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A, Vecs bb, Vecs xx)
1902 {
1903   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1904 
1905   PetscFunctionBegin;
1906   if (A->rmap->bs == 1) {
1907     PetscCall(MatSolve_SeqSBAIJ_1(A, bb->v, xx->v));
1908   } else {
1909     IS                 isrow = a->row;
1910     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1911     const MatScalar   *aa = a->a, *v;
1912     PetscScalar       *x, *t;
1913     const PetscScalar *b;
1914     PetscInt           nz, k, n, i, j;
1915 
1916     if (bb->n > a->solves_work_n) {
1917       PetscCall(PetscFree(a->solves_work));
1918       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1919       a->solves_work_n = bb->n;
1920     }
1921     n = bb->n;
1922     PetscCall(VecGetArrayRead(bb->v, &b));
1923     PetscCall(VecGetArray(xx->v, &x));
1924     t = a->solves_work;
1925 
1926     PetscCall(ISGetIndices(isrow, &rp));
1927 
1928     /* solve U^T*D*y = perm(b) by forward substitution */
1929     for (k = 0; k < mbs; k++) {
1930       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1931     }
1932     for (k = 0; k < mbs; k++) {
1933       v  = aa + ai[k];
1934       vj = aj + ai[k];
1935       nz = ai[k + 1] - ai[k] - 1;
1936       for (j = 0; j < nz; j++) {
1937         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
1938         v++;
1939         vj++;
1940       }
1941       for (i = 0; i < n; i++) t[n * k + i] *= aa[nz]; /* note: aa[nz] = 1/D(k) */
1942     }
1943 
1944     /* solve U*perm(x) = y by back substitution */
1945     for (k = mbs - 1; k >= 0; k--) {
1946       v  = aa + ai[k] - 1;
1947       vj = aj + ai[k] - 1;
1948       nz = ai[k + 1] - ai[k] - 1;
1949       for (j = 0; j < nz; j++) {
1950         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
1951         v++;
1952         vj++;
1953       }
1954       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
1955     }
1956 
1957     PetscCall(ISRestoreIndices(isrow, &rp));
1958     PetscCall(VecRestoreArrayRead(bb->v, &b));
1959     PetscCall(VecRestoreArray(xx->v, &x));
1960     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
1961   }
1962   PetscFunctionReturn(PETSC_SUCCESS);
1963 }
1964 
1965 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A, Vecs bb, Vecs xx)
1966 {
1967   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data;
1968 
1969   PetscFunctionBegin;
1970   if (A->rmap->bs == 1) {
1971     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A, bb->v, xx->v));
1972   } else {
1973     IS                 isrow = a->row;
1974     const PetscInt    *vj, mbs = a->mbs, *ai = a->i, *aj = a->j, *rp;
1975     const MatScalar   *aa = a->a, *v;
1976     PetscScalar       *x, *t;
1977     const PetscScalar *b;
1978     PetscInt           nz, k, n, i;
1979 
1980     if (bb->n > a->solves_work_n) {
1981       PetscCall(PetscFree(a->solves_work));
1982       PetscCall(PetscMalloc1(bb->n * A->rmap->N, &a->solves_work));
1983       a->solves_work_n = bb->n;
1984     }
1985     n = bb->n;
1986     PetscCall(VecGetArrayRead(bb->v, &b));
1987     PetscCall(VecGetArray(xx->v, &x));
1988     t = a->solves_work;
1989 
1990     PetscCall(ISGetIndices(isrow, &rp));
1991 
1992     /* solve U^T*D*y = perm(b) by forward substitution */
1993     for (k = 0; k < mbs; k++) {
1994       for (i = 0; i < n; i++) t[n * k + i] = b[rp[k] + i * mbs]; /* values are stored interlaced in t */
1995     }
1996     for (k = 0; k < mbs; k++) {
1997       v  = aa + ai[k];
1998       vj = aj + ai[k];
1999       nz = ai[k + 1] - ai[k];
2000       while (nz--) {
2001         for (i = 0; i < n; i++) t[n * (*vj) + i] += (*v) * t[n * k + i];
2002         v++;
2003         vj++;
2004       }
2005       for (i = 0; i < n; i++) t[n * k + i] *= aa[k]; /* note: aa[k] = 1/D(k) */
2006     }
2007 
2008     /* solve U*perm(x) = y by back substitution */
2009     for (k = mbs - 1; k >= 0; k--) {
2010       v  = aa + ai[k];
2011       vj = aj + ai[k];
2012       nz = ai[k + 1] - ai[k];
2013       while (nz--) {
2014         for (i = 0; i < n; i++) t[n * k + i] += (*v) * t[n * (*vj) + i];
2015         v++;
2016         vj++;
2017       }
2018       for (i = 0; i < n; i++) x[rp[k] + i * mbs] = t[n * k + i];
2019     }
2020 
2021     PetscCall(ISRestoreIndices(isrow, &rp));
2022     PetscCall(VecRestoreArrayRead(bb->v, &b));
2023     PetscCall(VecRestoreArray(xx->v, &x));
2024     PetscCall(PetscLogFlops(bb->n * (4.0 * a->nz - 3.0 * mbs)));
2025   }
2026   PetscFunctionReturn(PETSC_SUCCESS);
2027 }
2028 
2029 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2030 {
2031   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2032   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2033   const MatScalar   *aa = a->a, *v;
2034   const PetscScalar *b;
2035   PetscScalar       *x, xi;
2036   PetscInt           nz, i, j;
2037 
2038   PetscFunctionBegin;
2039   PetscCall(VecGetArrayRead(bb, &b));
2040   PetscCall(VecGetArray(xx, &x));
2041   /* solve U^T*D*y = b by forward substitution */
2042   PetscCall(PetscArraycpy(x, b, mbs));
2043   for (i = 0; i < mbs; i++) {
2044     v  = aa + ai[i];
2045     vj = aj + ai[i];
2046     xi = x[i];
2047     nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2048     for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2049     x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2050   }
2051   /* solve U*x = y by backward substitution */
2052   for (i = mbs - 2; i >= 0; i--) {
2053     xi = x[i];
2054     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2055     vj = aj + adiag[i] - 1;
2056     nz = ai[i + 1] - ai[i] - 1;
2057     for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2058     x[i] = xi;
2059   }
2060   PetscCall(VecRestoreArrayRead(bb, &b));
2061   PetscCall(VecRestoreArray(xx, &x));
2062   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2063   PetscFunctionReturn(PETSC_SUCCESS);
2064 }
2065 
2066 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Mat B, Mat X)
2067 {
2068   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2069   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj, *adiag = a->diag;
2070   const MatScalar   *aa = a->a, *v;
2071   const PetscScalar *b;
2072   PetscScalar       *x, xi;
2073   PetscInt           nz, i, j, neq, ldb, ldx;
2074   PetscBool          isdense;
2075 
2076   PetscFunctionBegin;
2077   if (!mbs) PetscFunctionReturn(PETSC_SUCCESS);
2078   PetscCall(PetscObjectTypeCompare((PetscObject)B, MATSEQDENSE, &isdense));
2079   PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "B matrix must be a SeqDense matrix");
2080   if (X != B) {
2081     PetscCall(PetscObjectTypeCompare((PetscObject)X, MATSEQDENSE, &isdense));
2082     PetscCheck(isdense, PETSC_COMM_SELF, PETSC_ERR_ARG_INCOMP, "X matrix must be a SeqDense matrix");
2083   }
2084   PetscCall(MatDenseGetArrayRead(B, &b));
2085   PetscCall(MatDenseGetLDA(B, &ldb));
2086   PetscCall(MatDenseGetArray(X, &x));
2087   PetscCall(MatDenseGetLDA(X, &ldx));
2088   for (neq = 0; neq < B->cmap->n; neq++) {
2089     /* solve U^T*D*y = b by forward substitution */
2090     PetscCall(PetscArraycpy(x, b, mbs));
2091     for (i = 0; i < mbs; i++) {
2092       v  = aa + ai[i];
2093       vj = aj + ai[i];
2094       xi = x[i];
2095       nz = ai[i + 1] - ai[i] - 1; /* exclude diag[i] */
2096       for (j = 0; j < nz; j++) x[vj[j]] += v[j] * xi;
2097       x[i] = xi * v[nz]; /* v[nz] = aa[diag[i]] = 1/D(i) */
2098     }
2099     /* solve U*x = y by backward substitution */
2100     for (i = mbs - 2; i >= 0; i--) {
2101       xi = x[i];
2102       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2103       vj = aj + adiag[i] - 1;
2104       nz = ai[i + 1] - ai[i] - 1;
2105       for (j = 0; j < nz; j++) xi += v[-j] * x[vj[-j]];
2106       x[i] = xi;
2107     }
2108     b += ldb;
2109     x += ldx;
2110   }
2111   PetscCall(MatDenseRestoreArrayRead(B, &b));
2112   PetscCall(MatDenseRestoreArray(X, &x));
2113   PetscCall(PetscLogFlops(B->cmap->n * (4.0 * a->nz - 3 * mbs)));
2114   PetscFunctionReturn(PETSC_SUCCESS);
2115 }
2116 
2117 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2118 {
2119   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2120   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2121   const MatScalar   *aa = a->a, *v;
2122   PetscScalar       *x, xk;
2123   const PetscScalar *b;
2124   PetscInt           nz, k;
2125 
2126   PetscFunctionBegin;
2127   PetscCall(VecGetArrayRead(bb, &b));
2128   PetscCall(VecGetArray(xx, &x));
2129 
2130   /* solve U^T*D*y = b by forward substitution */
2131   PetscCall(PetscArraycpy(x, b, mbs));
2132   for (k = 0; k < mbs; k++) {
2133     v  = aa + ai[k] + 1;
2134     vj = aj + ai[k] + 1;
2135     xk = x[k];
2136     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2137     while (nz--) x[*vj++] += (*v++) * xk;
2138     x[k] = xk * aa[ai[k]]; /* note: aa[diag[k]] = 1/D(k) */
2139   }
2140 
2141   /* solve U*x = y by back substitution */
2142   for (k = mbs - 2; k >= 0; k--) {
2143     v  = aa + ai[k] + 1;
2144     vj = aj + ai[k] + 1;
2145     xk = x[k];
2146     nz = ai[k + 1] - ai[k] - 1;
2147     while (nz--) xk += (*v++) * x[*vj++];
2148     x[k] = xk;
2149   }
2150 
2151   PetscCall(VecRestoreArrayRead(bb, &b));
2152   PetscCall(VecRestoreArray(xx, &x));
2153   PetscCall(PetscLogFlops(4.0 * a->nz - 3 * mbs));
2154   PetscFunctionReturn(PETSC_SUCCESS);
2155 }
2156 
2157 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2158 {
2159   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2160   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2161   const MatScalar   *aa = a->a, *v;
2162   PetscReal          diagk;
2163   PetscScalar       *x;
2164   const PetscScalar *b;
2165   PetscInt           nz, k;
2166 
2167   PetscFunctionBegin;
2168   /* solve U^T*D^(1/2)*x = b by forward substitution */
2169   PetscCall(VecGetArrayRead(bb, &b));
2170   PetscCall(VecGetArray(xx, &x));
2171   PetscCall(PetscArraycpy(x, b, mbs));
2172   for (k = 0; k < mbs; k++) {
2173     v  = aa + ai[k];
2174     vj = aj + ai[k];
2175     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2176     while (nz--) x[*vj++] += (*v++) * x[k];
2177     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2178     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[adiag[k]]), (double)PetscImaginaryPart(aa[adiag[k]]));
2179     x[k] *= PetscSqrtReal(diagk);
2180   }
2181   PetscCall(VecRestoreArrayRead(bb, &b));
2182   PetscCall(VecRestoreArray(xx, &x));
2183   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2184   PetscFunctionReturn(PETSC_SUCCESS);
2185 }
2186 
2187 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2188 {
2189   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2190   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2191   const MatScalar   *aa = a->a, *v;
2192   PetscReal          diagk;
2193   PetscScalar       *x;
2194   const PetscScalar *b;
2195   PetscInt           nz, k;
2196 
2197   PetscFunctionBegin;
2198   /* solve U^T*D^(1/2)*x = b by forward substitution */
2199   PetscCall(VecGetArrayRead(bb, &b));
2200   PetscCall(VecGetArray(xx, &x));
2201   PetscCall(PetscArraycpy(x, b, mbs));
2202   for (k = 0; k < mbs; k++) {
2203     v  = aa + ai[k] + 1;
2204     vj = aj + ai[k] + 1;
2205     nz = ai[k + 1] - ai[k] - 1; /* exclude diag[k] */
2206     while (nz--) x[*vj++] += (*v++) * x[k];
2207     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2208     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal (%g,%g) must be real and nonnegative", (double)PetscRealPart(aa[ai[k]]), (double)PetscImaginaryPart(aa[ai[k]]));
2209     x[k] *= PetscSqrtReal(diagk);
2210   }
2211   PetscCall(VecRestoreArrayRead(bb, &b));
2212   PetscCall(VecRestoreArray(xx, &x));
2213   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2214   PetscFunctionReturn(PETSC_SUCCESS);
2215 }
2216 
2217 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A, Vec bb, Vec xx)
2218 {
2219   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2220   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vj;
2221   const MatScalar   *aa = a->a, *v;
2222   PetscReal          diagk;
2223   PetscScalar       *x;
2224   const PetscScalar *b;
2225   PetscInt           nz, k;
2226 
2227   PetscFunctionBegin;
2228   /* solve D^(1/2)*U*x = b by back substitution */
2229   PetscCall(VecGetArrayRead(bb, &b));
2230   PetscCall(VecGetArray(xx, &x));
2231 
2232   for (k = mbs - 1; k >= 0; k--) {
2233     v     = aa + ai[k];
2234     vj    = aj + ai[k];
2235     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2236     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2237     x[k] = PetscSqrtReal(diagk) * b[k];
2238     nz   = ai[k + 1] - ai[k] - 1;
2239     while (nz--) x[k] += (*v++) * x[*vj++];
2240   }
2241   PetscCall(VecRestoreArrayRead(bb, &b));
2242   PetscCall(VecRestoreArray(xx, &x));
2243   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2244   PetscFunctionReturn(PETSC_SUCCESS);
2245 }
2246 
2247 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx)
2248 {
2249   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ *)A->data;
2250   const PetscInt     mbs = a->mbs, *ai = a->i, *aj = a->j, *vj;
2251   const MatScalar   *aa = a->a, *v;
2252   PetscReal          diagk;
2253   PetscScalar       *x;
2254   const PetscScalar *b;
2255   PetscInt           nz, k;
2256 
2257   PetscFunctionBegin;
2258   /* solve D^(1/2)*U*x = b by back substitution */
2259   PetscCall(VecGetArrayRead(bb, &b));
2260   PetscCall(VecGetArray(xx, &x));
2261 
2262   for (k = mbs - 1; k >= 0; k--) {
2263     v     = aa + ai[k] + 1;
2264     vj    = aj + ai[k] + 1;
2265     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2266     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0, PETSC_COMM_SELF, PETSC_ERR_SUP, "Diagonal must be real and nonnegative");
2267     x[k] = PetscSqrtReal(diagk) * b[k];
2268     nz   = ai[k + 1] - ai[k] - 1;
2269     while (nz--) x[k] += (*v++) * x[*vj++];
2270   }
2271   PetscCall(VecRestoreArrayRead(bb, &b));
2272   PetscCall(VecRestoreArray(xx, &x));
2273   PetscCall(PetscLogFlops(2.0 * a->nz - mbs));
2274   PetscFunctionReturn(PETSC_SUCCESS);
2275 }
2276 
2277 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2278 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B, Mat A, IS perm, const MatFactorInfo *info)
2279 {
2280   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data, *b;
2281   const PetscInt *rip, mbs    = a->mbs, *ai, *aj;
2282   PetscInt       *jutmp, bs   = A->rmap->bs, i;
2283   PetscInt        m, reallocs = 0, *levtmp;
2284   PetscInt       *prowl, *q, jmin, jmax, juidx, nzk, qm, *iu, *ju, k, j, vj, umax, maxadd;
2285   PetscInt        incrlev, *lev, shift, prow, nz;
2286   PetscReal       f = info->fill, levels = info->levels;
2287   PetscBool       perm_identity;
2288 
2289   PetscFunctionBegin;
2290   /* check whether perm is the identity mapping */
2291   PetscCall(ISIdentity(perm, &perm_identity));
2292 
2293   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2294   a->permute = PETSC_FALSE;
2295   ai         = a->i;
2296   aj         = a->j;
2297 
2298   /* initialization */
2299   PetscCall(ISGetIndices(perm, &rip));
2300   umax = (PetscInt)(f * ai[mbs] + 1);
2301   PetscCall(PetscMalloc1(umax, &lev));
2302   umax += mbs + 1;
2303   shift = mbs + 1;
2304   PetscCall(PetscMalloc1(mbs + 1, &iu));
2305   PetscCall(PetscMalloc1(umax, &ju));
2306   iu[0] = mbs + 1;
2307   juidx = mbs + 1;
2308   /* prowl: linked list for pivot row */
2309   PetscCall(PetscMalloc3(mbs, &prowl, mbs, &q, mbs, &levtmp));
2310   /* q: linked list for col index */
2311 
2312   for (i = 0; i < mbs; i++) {
2313     prowl[i] = mbs;
2314     q[i]     = 0;
2315   }
2316 
2317   /* for each row k */
2318   for (k = 0; k < mbs; k++) {
2319     nzk  = 0;
2320     q[k] = mbs;
2321     /* copy current row into linked list */
2322     nz = ai[rip[k] + 1] - ai[rip[k]];
2323     j  = ai[rip[k]];
2324     while (nz--) {
2325       vj = rip[aj[j++]];
2326       if (vj > k) {
2327         qm = k;
2328         do {
2329           m  = qm;
2330           qm = q[m];
2331         } while (qm < vj);
2332         PetscCheck(qm != vj, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Duplicate entry in A");
2333         nzk++;
2334         q[m]       = vj;
2335         q[vj]      = qm;
2336         levtmp[vj] = 0; /* initialize lev for nonzero element */
2337       }
2338     }
2339 
2340     /* modify nonzero structure of k-th row by computing fill-in
2341        for each row prow to be merged in */
2342     prow = k;
2343     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2344 
2345     while (prow < k) {
2346       /* merge row prow into k-th row */
2347       jmin = iu[prow] + 1;
2348       jmax = iu[prow + 1];
2349       qm   = k;
2350       for (j = jmin; j < jmax; j++) {
2351         incrlev = lev[j - shift] + 1;
2352         if (incrlev > levels) continue;
2353         vj = ju[j];
2354         do {
2355           m  = qm;
2356           qm = q[m];
2357         } while (qm < vj);
2358         if (qm != vj) { /* a fill */
2359           nzk++;
2360           q[m]       = vj;
2361           q[vj]      = qm;
2362           qm         = vj;
2363           levtmp[vj] = incrlev;
2364         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2365       }
2366       prow = prowl[prow]; /* next pivot row */
2367     }
2368 
2369     /* add k to row list for first nonzero element in k-th row */
2370     if (nzk > 1) {
2371       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2372       prowl[k] = prowl[i];
2373       prowl[i] = k;
2374     }
2375     iu[k + 1] = iu[k] + nzk;
2376 
2377     /* allocate more space to ju and lev if needed */
2378     if (iu[k + 1] > umax) {
2379       /* estimate how much additional space we will need */
2380       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2381       /* just double the memory each time */
2382       maxadd = umax;
2383       if (maxadd < nzk) maxadd = (mbs - k) * (nzk + 1) / 2;
2384       umax += maxadd;
2385 
2386       /* allocate a longer ju */
2387       PetscCall(PetscMalloc1(umax, &jutmp));
2388       PetscCall(PetscArraycpy(jutmp, ju, iu[k]));
2389       PetscCall(PetscFree(ju));
2390       ju = jutmp;
2391 
2392       PetscCall(PetscMalloc1(umax, &jutmp));
2393       PetscCall(PetscArraycpy(jutmp, lev, iu[k] - shift));
2394       PetscCall(PetscFree(lev));
2395       lev = jutmp;
2396       reallocs += 2; /* count how many times we realloc */
2397     }
2398 
2399     /* save nonzero structure of k-th row in ju */
2400     i = k;
2401     while (nzk--) {
2402       i                  = q[i];
2403       ju[juidx]          = i;
2404       lev[juidx - shift] = levtmp[i];
2405       juidx++;
2406     }
2407   }
2408 
2409 #if defined(PETSC_USE_INFO)
2410   if (ai[mbs] != 0) {
2411     PetscReal af = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2412     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)f, (double)af));
2413     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2414     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g);\n", (double)af));
2415     PetscCall(PetscInfo(A, "for best performance.\n"));
2416   } else {
2417     PetscCall(PetscInfo(A, "Empty matrix\n"));
2418   }
2419 #endif
2420 
2421   PetscCall(ISRestoreIndices(perm, &rip));
2422   PetscCall(PetscFree3(prowl, q, levtmp));
2423   PetscCall(PetscFree(lev));
2424 
2425   /* put together the new matrix */
2426   PetscCall(MatSeqSBAIJSetPreallocation(B, bs, 0, NULL));
2427 
2428   b = (Mat_SeqSBAIJ *)(B)->data;
2429   PetscCall(PetscFree2(b->imax, b->ilen));
2430 
2431   b->singlemalloc = PETSC_FALSE;
2432   b->free_a       = PETSC_TRUE;
2433   b->free_ij      = PETSC_TRUE;
2434   /* the next line frees the default space generated by the Create() */
2435   PetscCall(PetscFree3(b->a, b->j, b->i));
2436   PetscCall(PetscMalloc1((iu[mbs] + 1) * a->bs2, &b->a));
2437   b->j    = ju;
2438   b->i    = iu;
2439   b->diag = NULL;
2440   b->ilen = NULL;
2441   b->imax = NULL;
2442 
2443   PetscCall(ISDestroy(&b->row));
2444   PetscCall(ISDestroy(&b->icol));
2445   b->row  = perm;
2446   b->icol = perm;
2447   PetscCall(PetscObjectReference((PetscObject)perm));
2448   PetscCall(PetscObjectReference((PetscObject)perm));
2449   PetscCall(PetscMalloc1(bs * mbs + bs, &b->solve_work));
2450   /* In b structure:  Free imax, ilen, old a, old j.
2451      Allocate idnew, solve_work, new a, new j */
2452   b->maxnz = b->nz = iu[mbs];
2453 
2454   (B)->info.factor_mallocs   = reallocs;
2455   (B)->info.fill_ratio_given = f;
2456   if (ai[mbs] != 0) {
2457     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs]) / ((PetscReal)ai[mbs]);
2458   } else {
2459     (B)->info.fill_ratio_needed = 0.0;
2460   }
2461   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B, perm_identity));
2462   PetscFunctionReturn(PETSC_SUCCESS);
2463 }
2464 
2465 /*
2466   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2467 */
2468 #include <petscbt.h>
2469 #include <../src/mat/utils/freespace.h>
2470 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2471 {
2472   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data, *b;
2473   PetscBool          perm_identity, free_ij = PETSC_TRUE, missing;
2474   PetscInt           bs = A->rmap->bs, am = a->mbs, d, *ai = a->i, *aj = a->j;
2475   const PetscInt    *rip;
2476   PetscInt           reallocs = 0, i, *ui, *udiag, *cols;
2477   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2478   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *uj, **uj_ptr, **uj_lvl_ptr;
2479   PetscReal          fill = info->fill, levels = info->levels;
2480   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2481   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2482   PetscBT            lnkbt;
2483 
2484   PetscFunctionBegin;
2485   PetscCheck(A->rmap->n == A->cmap->n, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT, A->rmap->n, A->cmap->n);
2486   PetscCall(MatMissingDiagonal(A, &missing, &d));
2487   PetscCheck(!missing, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONGSTATE, "Matrix is missing diagonal entry %" PetscInt_FMT, d);
2488   if (bs > 1) {
2489     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact, A, perm, info));
2490     PetscFunctionReturn(PETSC_SUCCESS);
2491   }
2492 
2493   /* check whether perm is the identity mapping */
2494   PetscCall(ISIdentity(perm, &perm_identity));
2495   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2496   a->permute = PETSC_FALSE;
2497 
2498   PetscCall(PetscMalloc1(am + 1, &ui));
2499   PetscCall(PetscMalloc1(am + 1, &udiag));
2500   ui[0] = 0;
2501 
2502   /* ICC(0) without matrix ordering: simply rearrange column indices */
2503   if (!levels) {
2504     /* reuse the column pointers and row offsets for memory savings */
2505     for (i = 0; i < am; i++) {
2506       ncols     = ai[i + 1] - ai[i];
2507       ui[i + 1] = ui[i] + ncols;
2508       udiag[i]  = ui[i + 1] - 1; /* points to the last entry of U(i,:) */
2509     }
2510     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2511     cols = uj;
2512     for (i = 0; i < am; i++) {
2513       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2514       ncols = ai[i + 1] - ai[i] - 1;
2515       for (j = 0; j < ncols; j++) *cols++ = aj[j];
2516       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2517     }
2518   } else { /* case: levels>0 */
2519     PetscCall(ISGetIndices(perm, &rip));
2520 
2521     /* initialization */
2522     /* jl: linked list for storing indices of the pivot rows
2523        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2524     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2525     for (i = 0; i < am; i++) {
2526       jl[i] = am;
2527       il[i] = 0;
2528     }
2529 
2530     /* create and initialize a linked list for storing column indices of the active row k */
2531     nlnk = am + 1;
2532     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2533 
2534     /* initial FreeSpace size is fill*(ai[am]+1) */
2535     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2536 
2537     current_space = free_space;
2538 
2539     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2540 
2541     current_space_lvl = free_space_lvl;
2542 
2543     for (k = 0; k < am; k++) { /* for each active row k */
2544       /* initialize lnk by the column indices of row k */
2545       nzk   = 0;
2546       ncols = ai[k + 1] - ai[k];
2547       PetscCheck(ncols, PETSC_COMM_SELF, PETSC_ERR_MAT_CH_ZRPVT, "Empty row %" PetscInt_FMT " in matrix ", k);
2548       cols = aj + ai[k];
2549       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2550       nzk += nlnk;
2551 
2552       /* update lnk by computing fill-in for each pivot row to be merged in */
2553       prow = jl[k]; /* 1st pivot row */
2554 
2555       while (prow < k) {
2556         nextprow = jl[prow];
2557 
2558         /* merge prow into k-th row */
2559         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2560         jmax  = ui[prow + 1];
2561         ncols = jmax - jmin;
2562         i     = jmin - ui[prow];
2563 
2564         cols = uj_ptr[prow] + i;     /* points to the 2nd nzero entry in U(prow,k:am-1) */
2565         uj   = uj_lvl_ptr[prow] + i; /* levels of cols */
2566         j    = *(uj - 1);
2567         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, uj, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2568         nzk += nlnk;
2569 
2570         /* update il and jl for prow */
2571         if (jmin < jmax) {
2572           il[prow] = jmin;
2573 
2574           j        = *cols;
2575           jl[prow] = jl[j];
2576           jl[j]    = prow;
2577         }
2578         prow = nextprow;
2579       }
2580 
2581       /* if free space is not available, make more free space */
2582       if (current_space->local_remaining < nzk) {
2583         i = am - k + 1;                                    /* num of unfactored rows */
2584         i = PetscIntMultTruncate(i, PetscMin(nzk, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2585         PetscCall(PetscFreeSpaceGet(i, &current_space));
2586         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2587         reallocs++;
2588       }
2589 
2590       /* copy data into free_space and free_space_lvl, then initialize lnk */
2591       PetscCheck(nzk != 0, PETSC_COMM_SELF, PETSC_ERR_ARG_WRONG, "Empty row %" PetscInt_FMT " in ICC matrix factor", k);
2592       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2593 
2594       /* add the k-th row into il and jl */
2595       if (nzk > 1) {
2596         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2597         jl[k] = jl[i];
2598         jl[i] = k;
2599         il[k] = ui[k] + 1;
2600       }
2601       uj_ptr[k]     = current_space->array;
2602       uj_lvl_ptr[k] = current_space_lvl->array;
2603 
2604       current_space->array += nzk;
2605       current_space->local_used += nzk;
2606       current_space->local_remaining -= nzk;
2607       current_space_lvl->array += nzk;
2608       current_space_lvl->local_used += nzk;
2609       current_space_lvl->local_remaining -= nzk;
2610 
2611       ui[k + 1] = ui[k] + nzk;
2612     }
2613 
2614     PetscCall(ISRestoreIndices(perm, &rip));
2615     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2616 
2617     /* destroy list of free space and other temporary array(s) */
2618     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2619     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space, uj, am, ui, udiag)); /* store matrix factor  */
2620     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2621     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2622 
2623   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2624 
2625   /* put together the new matrix in MATSEQSBAIJ format */
2626   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2627 
2628   b = (Mat_SeqSBAIJ *)(fact)->data;
2629   PetscCall(PetscFree2(b->imax, b->ilen));
2630 
2631   b->singlemalloc = PETSC_FALSE;
2632   b->free_a       = PETSC_TRUE;
2633   b->free_ij      = free_ij;
2634 
2635   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2636 
2637   b->j         = uj;
2638   b->i         = ui;
2639   b->diag      = udiag;
2640   b->free_diag = PETSC_TRUE;
2641   b->ilen      = NULL;
2642   b->imax      = NULL;
2643   b->row       = perm;
2644   b->col       = perm;
2645 
2646   PetscCall(PetscObjectReference((PetscObject)perm));
2647   PetscCall(PetscObjectReference((PetscObject)perm));
2648 
2649   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2650 
2651   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2652 
2653   b->maxnz = b->nz = ui[am];
2654 
2655   fact->info.factor_mallocs   = reallocs;
2656   fact->info.fill_ratio_given = fill;
2657   if (ai[am] != 0) {
2658     fact->info.fill_ratio_needed = ((PetscReal)ui[am]) / ai[am];
2659   } else {
2660     fact->info.fill_ratio_needed = 0.0;
2661   }
2662 #if defined(PETSC_USE_INFO)
2663   if (ai[am] != 0) {
2664     PetscReal af = fact->info.fill_ratio_needed;
2665     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2666     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2667     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2668   } else {
2669     PetscCall(PetscInfo(A, "Empty matrix\n"));
2670   }
2671 #endif
2672   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2673   PetscFunctionReturn(PETSC_SUCCESS);
2674 }
2675 
2676 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact, Mat A, IS perm, const MatFactorInfo *info)
2677 {
2678   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
2679   Mat_SeqSBAIJ      *b;
2680   PetscBool          perm_identity, free_ij = PETSC_TRUE;
2681   PetscInt           bs = A->rmap->bs, am = a->mbs;
2682   const PetscInt    *cols, *rip, *ai = a->i, *aj = a->j;
2683   PetscInt           reallocs = 0, i, *ui;
2684   PetscInt           jmin, jmax, nzk, k, j, *jl, prow, *il, nextprow;
2685   PetscInt           nlnk, *lnk, *lnk_lvl = NULL, ncols, *cols_lvl, *uj, **uj_ptr, **uj_lvl_ptr;
2686   PetscReal          fill = info->fill, levels = info->levels, ratio_needed;
2687   PetscFreeSpaceList free_space = NULL, current_space = NULL;
2688   PetscFreeSpaceList free_space_lvl = NULL, current_space_lvl = NULL;
2689   PetscBT            lnkbt;
2690 
2691   PetscFunctionBegin;
2692   /*
2693    This code originally uses Modified Sparse Row (MSR) storage
2694    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2695    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2696    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2697    thus the original code in MSR format is still used for these cases.
2698    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2699    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2700   */
2701   if (bs > 1) {
2702     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact, A, perm, info));
2703     PetscFunctionReturn(PETSC_SUCCESS);
2704   }
2705 
2706   /* check whether perm is the identity mapping */
2707   PetscCall(ISIdentity(perm, &perm_identity));
2708   PetscCheck(perm_identity, PETSC_COMM_SELF, PETSC_ERR_SUP, "Matrix reordering is not supported for sbaij matrix. Use aij format");
2709   a->permute = PETSC_FALSE;
2710 
2711   /* special case that simply copies fill pattern */
2712   if (!levels) {
2713     /* reuse the column pointers and row offsets for memory savings */
2714     ui           = a->i;
2715     uj           = a->j;
2716     free_ij      = PETSC_FALSE;
2717     ratio_needed = 1.0;
2718   } else { /* case: levels>0 */
2719     PetscCall(ISGetIndices(perm, &rip));
2720 
2721     /* initialization */
2722     PetscCall(PetscMalloc1(am + 1, &ui));
2723     ui[0] = 0;
2724 
2725     /* jl: linked list for storing indices of the pivot rows
2726        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2727     PetscCall(PetscMalloc4(am, &uj_ptr, am, &uj_lvl_ptr, am, &il, am, &jl));
2728     for (i = 0; i < am; i++) {
2729       jl[i] = am;
2730       il[i] = 0;
2731     }
2732 
2733     /* create and initialize a linked list for storing column indices of the active row k */
2734     nlnk = am + 1;
2735     PetscCall(PetscIncompleteLLCreate(am, am, nlnk, lnk, lnk_lvl, lnkbt));
2736 
2737     /* initial FreeSpace size is fill*(ai[am]+1) */
2738     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space));
2739 
2740     current_space = free_space;
2741 
2742     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill, ai[am] + 1), &free_space_lvl));
2743 
2744     current_space_lvl = free_space_lvl;
2745 
2746     for (k = 0; k < am; k++) { /* for each active row k */
2747       /* initialize lnk by the column indices of row rip[k] */
2748       nzk   = 0;
2749       ncols = ai[rip[k] + 1] - ai[rip[k]];
2750       cols  = aj + ai[rip[k]];
2751       PetscCall(PetscIncompleteLLInit(ncols, cols, am, rip, &nlnk, lnk, lnk_lvl, lnkbt));
2752       nzk += nlnk;
2753 
2754       /* update lnk by computing fill-in for each pivot row to be merged in */
2755       prow = jl[k]; /* 1st pivot row */
2756 
2757       while (prow < k) {
2758         nextprow = jl[prow];
2759 
2760         /* merge prow into k-th row */
2761         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2762         jmax     = ui[prow + 1];
2763         ncols    = jmax - jmin;
2764         i        = jmin - ui[prow];
2765         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2766         j        = *(uj_lvl_ptr[prow] + i - 1);
2767         cols_lvl = uj_lvl_ptr[prow] + i;
2768         PetscCall(PetscICCLLAddSorted(ncols, cols, levels, cols_lvl, am, &nlnk, lnk, lnk_lvl, lnkbt, j));
2769         nzk += nlnk;
2770 
2771         /* update il and jl for prow */
2772         if (jmin < jmax) {
2773           il[prow] = jmin;
2774           j        = *cols;
2775           jl[prow] = jl[j];
2776           jl[j]    = prow;
2777         }
2778         prow = nextprow;
2779       }
2780 
2781       /* if free space is not available, make more free space */
2782       if (current_space->local_remaining < nzk) {
2783         i = am - k + 1;                                                             /* num of unfactored rows */
2784         i = PetscMin(PetscIntMultTruncate(i, nzk), PetscIntMultTruncate(i, i - 1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2785         PetscCall(PetscFreeSpaceGet(i, &current_space));
2786         PetscCall(PetscFreeSpaceGet(i, &current_space_lvl));
2787         reallocs++;
2788       }
2789 
2790       /* copy data into free_space and free_space_lvl, then initialize lnk */
2791       PetscCall(PetscIncompleteLLClean(am, am, nzk, lnk, lnk_lvl, current_space->array, current_space_lvl->array, lnkbt));
2792 
2793       /* add the k-th row into il and jl */
2794       if (nzk - 1 > 0) {
2795         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2796         jl[k] = jl[i];
2797         jl[i] = k;
2798         il[k] = ui[k] + 1;
2799       }
2800       uj_ptr[k]     = current_space->array;
2801       uj_lvl_ptr[k] = current_space_lvl->array;
2802 
2803       current_space->array += nzk;
2804       current_space->local_used += nzk;
2805       current_space->local_remaining -= nzk;
2806       current_space_lvl->array += nzk;
2807       current_space_lvl->local_used += nzk;
2808       current_space_lvl->local_remaining -= nzk;
2809 
2810       ui[k + 1] = ui[k] + nzk;
2811     }
2812 
2813     PetscCall(ISRestoreIndices(perm, &rip));
2814     PetscCall(PetscFree4(uj_ptr, uj_lvl_ptr, il, jl));
2815 
2816     /* destroy list of free space and other temporary array(s) */
2817     PetscCall(PetscMalloc1(ui[am] + 1, &uj));
2818     PetscCall(PetscFreeSpaceContiguous(&free_space, uj));
2819     PetscCall(PetscIncompleteLLDestroy(lnk, lnkbt));
2820     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2821     if (ai[am] != 0) {
2822       ratio_needed = ((PetscReal)ui[am]) / ((PetscReal)ai[am]);
2823     } else {
2824       ratio_needed = 0.0;
2825     }
2826   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2827 
2828   /* put together the new matrix in MATSEQSBAIJ format */
2829   PetscCall(MatSeqSBAIJSetPreallocation(fact, bs, MAT_SKIP_ALLOCATION, NULL));
2830 
2831   b = (Mat_SeqSBAIJ *)(fact)->data;
2832 
2833   PetscCall(PetscFree2(b->imax, b->ilen));
2834 
2835   b->singlemalloc = PETSC_FALSE;
2836   b->free_a       = PETSC_TRUE;
2837   b->free_ij      = free_ij;
2838 
2839   PetscCall(PetscMalloc1(ui[am] + 1, &b->a));
2840 
2841   b->j             = uj;
2842   b->i             = ui;
2843   b->diag          = NULL;
2844   b->ilen          = NULL;
2845   b->imax          = NULL;
2846   b->row           = perm;
2847   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2848 
2849   PetscCall(PetscObjectReference((PetscObject)perm));
2850   b->icol = perm;
2851   PetscCall(PetscObjectReference((PetscObject)perm));
2852   PetscCall(PetscMalloc1(am + 1, &b->solve_work));
2853 
2854   b->maxnz = b->nz = ui[am];
2855 
2856   fact->info.factor_mallocs    = reallocs;
2857   fact->info.fill_ratio_given  = fill;
2858   fact->info.fill_ratio_needed = ratio_needed;
2859 #if defined(PETSC_USE_INFO)
2860   if (ai[am] != 0) {
2861     PetscReal af = fact->info.fill_ratio_needed;
2862     PetscCall(PetscInfo(A, "Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n", reallocs, (double)fill, (double)af));
2863     PetscCall(PetscInfo(A, "Run with -pc_factor_fill %g or use \n", (double)af));
2864     PetscCall(PetscInfo(A, "PCFactorSetFill(pc,%g) for best performance.\n", (double)af));
2865   } else {
2866     PetscCall(PetscInfo(A, "Empty matrix\n"));
2867   }
2868 #endif
2869   if (perm_identity) {
2870     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2871   } else {
2872     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2873   }
2874   PetscFunctionReturn(PETSC_SUCCESS);
2875 }
2876