xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision ffa8c5705e8ab2cf85ee1d14dbe507a6e2eb5283)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11 {
12   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
13   IS                isrow=a->row;
14   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
15   const PetscInt    *r;
16   PetscInt          nz,*vj,k,idx,k1;
17   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
18   const MatScalar   *aa=a->a,*v,*diag;
19   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
20   const PetscScalar *b;
21 
22   PetscFunctionBegin;
23   PetscCall(VecGetArrayRead(bb,&b));
24   PetscCall(VecGetArray(xx,&x));
25   t    = a->solve_work;
26   PetscCall(ISGetIndices(isrow,&r));
27   PetscCall(PetscMalloc1(bs,&xk_tmp));
28 
29   /* solve U^T * D * y = b by forward substitution */
30   xk = t;
31   for (k=0; k<mbs; k++) { /* t <- perm(b) */
32     idx = bs*r[k];
33     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
34   }
35   for (k=0; k<mbs; k++) {
36     v    = aa + bs2*ai[k];
37     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
38     PetscCall(PetscArraycpy(xk_tmp,xk,bs)); /* xk_tmp <- xk */
39     nz   = ai[k+1] - ai[k];
40     vj   = aj + ai[k];
41     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
42     while (nz--) {
43       /* x(:) += U(k,:)^T*(Dk*xk) */
44       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
45       vj++; xj = t + (*vj)*bs;
46       v       += bs2;
47     }
48     /* xk = inv(Dk)*(Dk*xk) */
49     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
50     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
51   }
52 
53   /* solve U*x = y by back substitution */
54   for (k=mbs-1; k>=0; k--) {
55     v  = aa + bs2*ai[k];
56     xk = t + k*bs;        /* xk */
57     nz = ai[k+1] - ai[k];
58     vj = aj + ai[k];
59     xj = t + (*vj)*bs;
60     while (nz--) {
61       /* xk += U(k,:)*x(:) */
62       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
63       vj++;
64       v += bs2; xj = t + (*vj)*bs;
65     }
66     idx = bs*r[k];
67     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
68   }
69 
70   PetscCall(PetscFree(xk_tmp));
71   PetscCall(ISRestoreIndices(isrow,&r));
72   PetscCall(VecRestoreArrayRead(bb,&b));
73   PetscCall(VecRestoreArray(xx,&x));
74   PetscCall(PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs));
75   PetscFunctionReturn(0);
76 }
77 
78 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
79 {
80   PetscFunctionBegin;
81   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
82 }
83 
84 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
85 {
86   PetscFunctionBegin;
87   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
88 }
89 
90 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
91 {
92   PetscInt        nz,k;
93   const PetscInt  *vj,bs2 = bs*bs;
94   const MatScalar *v,*diag;
95   PetscScalar     *xk,*xj,*xk_tmp;
96 
97   PetscFunctionBegin;
98   PetscCall(PetscMalloc1(bs,&xk_tmp));
99   for (k=0; k<mbs; k++) {
100     v    = aa + bs2*ai[k];
101     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
102     PetscCall(PetscArraycpy(xk_tmp,xk,bs)); /* xk_tmp <- xk */
103     nz   = ai[k+1] - ai[k];
104     vj   = aj + ai[k];
105     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
106     while (nz--) {
107       /* x(:) += U(k,:)^T*(Dk*xk) */
108       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
109       vj++; xj = x + (*vj)*bs;
110       v       += bs2;
111     }
112     /* xk = inv(Dk)*(Dk*xk) */
113     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
114     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
115   }
116   PetscCall(PetscFree(xk_tmp));
117   PetscFunctionReturn(0);
118 }
119 
120 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
121 {
122   PetscInt        nz,k;
123   const PetscInt  *vj,bs2 = bs*bs;
124   const MatScalar *v;
125   PetscScalar     *xk,*xj;
126 
127   PetscFunctionBegin;
128   for (k=mbs-1; k>=0; k--) {
129     v  = aa + bs2*ai[k];
130     xk = x + k*bs;        /* xk */
131     nz = ai[k+1] - ai[k];
132     vj = aj + ai[k];
133     xj = x + (*vj)*bs;
134     while (nz--) {
135       /* xk += U(k,:)*x(:) */
136       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
137       vj++;
138       v += bs2; xj = x + (*vj)*bs;
139     }
140   }
141   PetscFunctionReturn(0);
142 }
143 
144 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
145 {
146   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
147   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
148   PetscInt          bs =A->rmap->bs;
149   const MatScalar   *aa=a->a;
150   PetscScalar       *x;
151   const PetscScalar *b;
152 
153   PetscFunctionBegin;
154   PetscCall(VecGetArrayRead(bb,&b));
155   PetscCall(VecGetArray(xx,&x));
156 
157   /* solve U^T * D * y = b by forward substitution */
158   PetscCall(PetscArraycpy(x,b,bs*mbs)); /* x <- b */
159   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x));
160 
161   /* solve U*x = y by back substitution */
162   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x));
163 
164   PetscCall(VecRestoreArrayRead(bb,&b));
165   PetscCall(VecRestoreArray(xx,&x));
166   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs));
167   PetscFunctionReturn(0);
168 }
169 
170 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
171 {
172   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
173   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
174   PetscInt          bs =A->rmap->bs;
175   const MatScalar   *aa=a->a;
176   const PetscScalar *b;
177   PetscScalar       *x;
178 
179   PetscFunctionBegin;
180   PetscCall(VecGetArrayRead(bb,&b));
181   PetscCall(VecGetArray(xx,&x));
182   PetscCall(PetscArraycpy(x,b,bs*mbs)); /* x <- b */
183   PetscCall(MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x));
184   PetscCall(VecRestoreArrayRead(bb,&b));
185   PetscCall(VecRestoreArray(xx,&x));
186   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs));
187   PetscFunctionReturn(0);
188 }
189 
190 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
191 {
192   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
193   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
194   PetscInt          bs =A->rmap->bs;
195   const MatScalar   *aa=a->a;
196   const PetscScalar *b;
197   PetscScalar       *x;
198 
199   PetscFunctionBegin;
200   PetscCall(VecGetArrayRead(bb,&b));
201   PetscCall(VecGetArray(xx,&x));
202   PetscCall(PetscArraycpy(x,b,bs*mbs));
203   PetscCall(MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x));
204   PetscCall(VecRestoreArrayRead(bb,&b));
205   PetscCall(VecRestoreArray(xx,&x));
206   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs)));
207   PetscFunctionReturn(0);
208 }
209 
210 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
211 {
212   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
213   IS                isrow=a->row;
214   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
215   PetscInt          nz,k,idx;
216   const MatScalar   *aa=a->a,*v,*d;
217   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
218   const PetscScalar *b;
219 
220   PetscFunctionBegin;
221   PetscCall(VecGetArrayRead(bb,&b));
222   PetscCall(VecGetArray(xx,&x));
223   t    = a->solve_work;
224   PetscCall(ISGetIndices(isrow,&r));
225 
226   /* solve U^T * D * y = b by forward substitution */
227   tp = t;
228   for (k=0; k<mbs; k++) { /* t <- perm(b) */
229     idx   = 7*r[k];
230     tp[0] = b[idx];
231     tp[1] = b[idx+1];
232     tp[2] = b[idx+2];
233     tp[3] = b[idx+3];
234     tp[4] = b[idx+4];
235     tp[5] = b[idx+5];
236     tp[6] = b[idx+6];
237     tp   += 7;
238   }
239 
240   for (k=0; k<mbs; k++) {
241     v  = aa + 49*ai[k];
242     vj = aj + ai[k];
243     tp = t + k*7;
244     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
245     nz = ai[k+1] - ai[k];
246     tp = t + (*vj)*7;
247     while (nz--) {
248       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
249       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
250       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
251       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
252       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
253       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
254       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
255       vj++;
256       tp = t + (*vj)*7;
257       v += 49;
258     }
259 
260     /* xk = inv(Dk)*(Dk*xk) */
261     d     = aa+k*49;       /* ptr to inv(Dk) */
262     tp    = t + k*7;
263     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
264     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
265     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
266     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
267     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
268     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
269     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
270   }
271 
272   /* solve U*x = y by back substitution */
273   for (k=mbs-1; k>=0; k--) {
274     v  = aa + 49*ai[k];
275     vj = aj + ai[k];
276     tp = t + k*7;
277     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
278     nz = ai[k+1] - ai[k];
279 
280     tp = t + (*vj)*7;
281     while (nz--) {
282       /* xk += U(k,:)*x(:) */
283       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
284       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
285       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
286       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
287       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
288       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
289       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
290       vj++;
291       tp = t + (*vj)*7;
292       v += 49;
293     }
294     tp       = t + k*7;
295     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
296     idx      = 7*r[k];
297     x[idx]   = x0;
298     x[idx+1] = x1;
299     x[idx+2] = x2;
300     x[idx+3] = x3;
301     x[idx+4] = x4;
302     x[idx+5] = x5;
303     x[idx+6] = x6;
304   }
305 
306   PetscCall(ISRestoreIndices(isrow,&r));
307   PetscCall(VecRestoreArrayRead(bb,&b));
308   PetscCall(VecRestoreArray(xx,&x));
309   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
310   PetscFunctionReturn(0);
311 }
312 
313 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
314 {
315   const MatScalar *v,*d;
316   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
317   PetscInt        nz,k;
318   const PetscInt  *vj;
319 
320   PetscFunctionBegin;
321   for (k=0; k<mbs; k++) {
322     v  = aa + 49*ai[k];
323     xp = x + k*7;
324     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
325     nz = ai[k+1] - ai[k];
326     vj = aj + ai[k];
327     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
328     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
329     while (nz--) {
330       xp = x + (*vj)*7;
331       /* x(:) += U(k,:)^T*(Dk*xk) */
332       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
333       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
334       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
335       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
336       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
337       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
338       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
339       vj++;
340       v += 49;
341     }
342     /* xk = inv(Dk)*(Dk*xk) */
343     d     = aa+k*49;       /* ptr to inv(Dk) */
344     xp    = x + k*7;
345     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
346     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
347     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
348     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
349     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
350     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
351     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
352   }
353   PetscFunctionReturn(0);
354 }
355 
356 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
357 {
358   const MatScalar *v;
359   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
360   PetscInt        nz,k;
361   const PetscInt  *vj;
362 
363   PetscFunctionBegin;
364   for (k=mbs-1; k>=0; k--) {
365     v  = aa + 49*ai[k];
366     xp = x + k*7;
367     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
368     nz = ai[k+1] - ai[k];
369     vj = aj + ai[k];
370     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
371     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
372     while (nz--) {
373       xp = x + (*vj)*7;
374       /* xk += U(k,:)*x(:) */
375       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
376       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
377       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
378       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
379       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
380       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
381       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
382       vj++;
383       v += 49;
384     }
385     xp = x + k*7;
386     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
387   }
388   PetscFunctionReturn(0);
389 }
390 
391 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
392 {
393   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
394   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
395   const MatScalar   *aa=a->a;
396   PetscScalar       *x;
397   const PetscScalar *b;
398 
399   PetscFunctionBegin;
400   PetscCall(VecGetArrayRead(bb,&b));
401   PetscCall(VecGetArray(xx,&x));
402 
403   /* solve U^T * D * y = b by forward substitution */
404   PetscCall(PetscArraycpy(x,b,7*mbs)); /* x <- b */
405   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x));
406 
407   /* solve U*x = y by back substitution */
408   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x));
409 
410   PetscCall(VecRestoreArrayRead(bb,&b));
411   PetscCall(VecRestoreArray(xx,&x));
412   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
413   PetscFunctionReturn(0);
414 }
415 
416 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
417 {
418   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
419   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
420   const MatScalar   *aa=a->a;
421   PetscScalar       *x;
422   const PetscScalar *b;
423 
424   PetscFunctionBegin;
425   PetscCall(VecGetArrayRead(bb,&b));
426   PetscCall(VecGetArray(xx,&x));
427   PetscCall(PetscArraycpy(x,b,7*mbs));
428   PetscCall(MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x));
429   PetscCall(VecRestoreArrayRead(bb,&b));
430   PetscCall(VecRestoreArray(xx,&x));
431   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
432   PetscFunctionReturn(0);
433 }
434 
435 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
436 {
437   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
438   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
439   const MatScalar   *aa=a->a;
440   PetscScalar       *x;
441   const PetscScalar *b;
442 
443   PetscFunctionBegin;
444   PetscCall(VecGetArrayRead(bb,&b));
445   PetscCall(VecGetArray(xx,&x));
446   PetscCall(PetscArraycpy(x,b,7*mbs));
447   PetscCall(MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x));
448   PetscCall(VecRestoreArrayRead(bb,&b));
449   PetscCall(VecRestoreArray(xx,&x));
450   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs)));
451   PetscFunctionReturn(0);
452 }
453 
454 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
455 {
456   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
457   IS                isrow=a->row;
458   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
459   PetscInt          nz,k,idx;
460   const MatScalar   *aa=a->a,*v,*d;
461   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
462   const PetscScalar *b;
463 
464   PetscFunctionBegin;
465   PetscCall(VecGetArrayRead(bb,&b));
466   PetscCall(VecGetArray(xx,&x));
467   t    = a->solve_work;
468   PetscCall(ISGetIndices(isrow,&r));
469 
470   /* solve U^T * D * y = b by forward substitution */
471   tp = t;
472   for (k=0; k<mbs; k++) { /* t <- perm(b) */
473     idx   = 6*r[k];
474     tp[0] = b[idx];
475     tp[1] = b[idx+1];
476     tp[2] = b[idx+2];
477     tp[3] = b[idx+3];
478     tp[4] = b[idx+4];
479     tp[5] = b[idx+5];
480     tp   += 6;
481   }
482 
483   for (k=0; k<mbs; k++) {
484     v  = aa + 36*ai[k];
485     vj = aj + ai[k];
486     tp = t + k*6;
487     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
488     nz = ai[k+1] - ai[k];
489     tp = t + (*vj)*6;
490     while (nz--) {
491       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
492       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
493       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
494       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
495       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
496       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
497       vj++;
498       tp = t + (*vj)*6;
499       v += 36;
500     }
501 
502     /* xk = inv(Dk)*(Dk*xk) */
503     d     = aa+k*36;       /* ptr to inv(Dk) */
504     tp    = t + k*6;
505     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
506     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
507     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
508     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
509     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
510     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
511   }
512 
513   /* solve U*x = y by back substitution */
514   for (k=mbs-1; k>=0; k--) {
515     v  = aa + 36*ai[k];
516     vj = aj + ai[k];
517     tp = t + k*6;
518     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
519     nz = ai[k+1] - ai[k];
520 
521     tp = t + (*vj)*6;
522     while (nz--) {
523       /* xk += U(k,:)*x(:) */
524       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
525       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
526       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
527       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
528       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
529       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
530       vj++;
531       tp = t + (*vj)*6;
532       v += 36;
533     }
534     tp       = t + k*6;
535     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
536     idx      = 6*r[k];
537     x[idx]   = x0;
538     x[idx+1] = x1;
539     x[idx+2] = x2;
540     x[idx+3] = x3;
541     x[idx+4] = x4;
542     x[idx+5] = x5;
543   }
544 
545   PetscCall(ISRestoreIndices(isrow,&r));
546   PetscCall(VecRestoreArrayRead(bb,&b));
547   PetscCall(VecRestoreArray(xx,&x));
548   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
549   PetscFunctionReturn(0);
550 }
551 
552 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
553 {
554   const MatScalar *v,*d;
555   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
556   PetscInt        nz,k;
557   const PetscInt  *vj;
558 
559   PetscFunctionBegin;
560   for (k=0; k<mbs; k++) {
561     v  = aa + 36*ai[k];
562     xp = x + k*6;
563     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
564     nz = ai[k+1] - ai[k];
565     vj = aj + ai[k];
566     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
567     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
568     while (nz--) {
569       xp = x + (*vj)*6;
570       /* x(:) += U(k,:)^T*(Dk*xk) */
571       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
572       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
573       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
574       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
575       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
576       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
577       vj++;
578       v += 36;
579     }
580     /* xk = inv(Dk)*(Dk*xk) */
581     d     = aa+k*36;       /* ptr to inv(Dk) */
582     xp    = x + k*6;
583     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
584     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
585     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
586     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
587     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
588     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
589   }
590   PetscFunctionReturn(0);
591 }
592 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
593 {
594   const MatScalar   *v;
595   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
596   PetscInt          nz,k;
597   const PetscInt    *vj;
598 
599   PetscFunctionBegin;
600   for (k=mbs-1; k>=0; k--) {
601     v  = aa + 36*ai[k];
602     xp = x + k*6;
603     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
604     nz = ai[k+1] - ai[k];
605     vj = aj + ai[k];
606     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
607     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
608     while (nz--) {
609       xp = x + (*vj)*6;
610       /* xk += U(k,:)*x(:) */
611       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
612       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
613       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
614       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
615       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
616       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
617       vj++;
618       v += 36;
619     }
620     xp   = x + k*6;
621     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
622   }
623   PetscFunctionReturn(0);
624 }
625 
626 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
627 {
628   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
629   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
630   const MatScalar   *aa=a->a;
631   PetscScalar       *x;
632   const PetscScalar *b;
633 
634   PetscFunctionBegin;
635   PetscCall(VecGetArrayRead(bb,&b));
636   PetscCall(VecGetArray(xx,&x));
637 
638   /* solve U^T * D * y = b by forward substitution */
639   PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */
640   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x));
641 
642   /* solve U*x = y by back substitution */
643   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x));
644 
645   PetscCall(VecRestoreArrayRead(bb,&b));
646   PetscCall(VecRestoreArray(xx,&x));
647   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
648   PetscFunctionReturn(0);
649 }
650 
651 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
652 {
653   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
654   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
655   const MatScalar   *aa=a->a;
656   PetscScalar       *x;
657   const PetscScalar *b;
658 
659   PetscFunctionBegin;
660   PetscCall(VecGetArrayRead(bb,&b));
661   PetscCall(VecGetArray(xx,&x));
662   PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */
663   PetscCall(MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x));
664   PetscCall(VecRestoreArrayRead(bb,&b));
665   PetscCall(VecRestoreArray(xx,&x));
666   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
667   PetscFunctionReturn(0);
668 }
669 
670 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
671 {
672   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
673   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
674   const MatScalar   *aa=a->a;
675   PetscScalar       *x;
676   const PetscScalar *b;
677 
678   PetscFunctionBegin;
679   PetscCall(VecGetArrayRead(bb,&b));
680   PetscCall(VecGetArray(xx,&x));
681   PetscCall(PetscArraycpy(x,b,6*mbs)); /* x <- b */
682   PetscCall(MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x));
683   PetscCall(VecRestoreArrayRead(bb,&b));
684   PetscCall(VecRestoreArray(xx,&x));
685   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz - mbs)));
686   PetscFunctionReturn(0);
687 }
688 
689 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
690 {
691   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
692   IS                isrow=a->row;
693   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
694   const PetscInt    *r,*vj;
695   PetscInt          nz,k,idx;
696   const MatScalar   *aa=a->a,*v,*diag;
697   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
698   const PetscScalar *b;
699 
700   PetscFunctionBegin;
701   PetscCall(VecGetArrayRead(bb,&b));
702   PetscCall(VecGetArray(xx,&x));
703   t    = a->solve_work;
704   PetscCall(ISGetIndices(isrow,&r));
705 
706   /* solve U^T * D * y = b by forward substitution */
707   tp = t;
708   for (k=0; k<mbs; k++) { /* t <- perm(b) */
709     idx   = 5*r[k];
710     tp[0] = b[idx];
711     tp[1] = b[idx+1];
712     tp[2] = b[idx+2];
713     tp[3] = b[idx+3];
714     tp[4] = b[idx+4];
715     tp   += 5;
716   }
717 
718   for (k=0; k<mbs; k++) {
719     v  = aa + 25*ai[k];
720     vj = aj + ai[k];
721     tp = t + k*5;
722     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
723     nz = ai[k+1] - ai[k];
724 
725     tp = t + (*vj)*5;
726     while (nz--) {
727       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
728       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
729       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
730       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
731       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
732       vj++;
733       tp = t + (*vj)*5;
734       v += 25;
735     }
736 
737     /* xk = inv(Dk)*(Dk*xk) */
738     diag  = aa+k*25;          /* ptr to inv(Dk) */
739     tp    = t + k*5;
740     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
741     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
742     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
743     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
744     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
745   }
746 
747   /* solve U*x = y by back substitution */
748   for (k=mbs-1; k>=0; k--) {
749     v  = aa + 25*ai[k];
750     vj = aj + ai[k];
751     tp = t + k*5;
752     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
753     nz = ai[k+1] - ai[k];
754 
755     tp = t + (*vj)*5;
756     while (nz--) {
757       /* xk += U(k,:)*x(:) */
758       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
759       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
760       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
761       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
762       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
763       vj++;
764       tp = t + (*vj)*5;
765       v += 25;
766     }
767     tp       = t + k*5;
768     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
769     idx      = 5*r[k];
770     x[idx]   = x0;
771     x[idx+1] = x1;
772     x[idx+2] = x2;
773     x[idx+3] = x3;
774     x[idx+4] = x4;
775   }
776 
777   PetscCall(ISRestoreIndices(isrow,&r));
778   PetscCall(VecRestoreArrayRead(bb,&b));
779   PetscCall(VecRestoreArray(xx,&x));
780   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
781   PetscFunctionReturn(0);
782 }
783 
784 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
785 {
786   const MatScalar *v,*diag;
787   PetscScalar     *xp,x0,x1,x2,x3,x4;
788   PetscInt        nz,k;
789   const PetscInt  *vj;
790 
791   PetscFunctionBegin;
792   for (k=0; k<mbs; k++) {
793     v  = aa + 25*ai[k];
794     xp = x + k*5;
795     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
796     nz = ai[k+1] - ai[k];
797     vj = aj + ai[k];
798     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
799     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
800     while (nz--) {
801       xp = x + (*vj)*5;
802       /* x(:) += U(k,:)^T*(Dk*xk) */
803       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
804       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
805       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
806       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
807       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
808       vj++;
809       v += 25;
810     }
811     /* xk = inv(Dk)*(Dk*xk) */
812     diag  = aa+k*25;         /* ptr to inv(Dk) */
813     xp    = x + k*5;
814     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
815     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
816     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
817     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
818     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
819   }
820   PetscFunctionReturn(0);
821 }
822 
823 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
824 {
825   const MatScalar *v;
826   PetscScalar     *xp,x0,x1,x2,x3,x4;
827   PetscInt        nz,k;
828   const PetscInt  *vj;
829 
830   PetscFunctionBegin;
831   for (k=mbs-1; k>=0; k--) {
832     v  = aa + 25*ai[k];
833     xp = x + k*5;
834     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
835     nz = ai[k+1] - ai[k];
836     vj = aj + ai[k];
837     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
838     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
839     while (nz--) {
840       xp = x + (*vj)*5;
841       /* xk += U(k,:)*x(:) */
842       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
843       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
844       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
845       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
846       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
847       vj++;
848       v += 25;
849     }
850     xp   = x + k*5;
851     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
852   }
853   PetscFunctionReturn(0);
854 }
855 
856 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
857 {
858   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
859   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
860   const MatScalar   *aa=a->a;
861   PetscScalar       *x;
862   const PetscScalar *b;
863 
864   PetscFunctionBegin;
865   PetscCall(VecGetArrayRead(bb,&b));
866   PetscCall(VecGetArray(xx,&x));
867 
868   /* solve U^T * D * y = b by forward substitution */
869   PetscCall(PetscArraycpy(x,b,5*mbs)); /* x <- b */
870   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x));
871 
872   /* solve U*x = y by back substitution */
873   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x));
874 
875   PetscCall(VecRestoreArrayRead(bb,&b));
876   PetscCall(VecRestoreArray(xx,&x));
877   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
878   PetscFunctionReturn(0);
879 }
880 
881 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
882 {
883   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
884   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
885   const MatScalar   *aa=a->a;
886   PetscScalar       *x;
887   const PetscScalar *b;
888 
889   PetscFunctionBegin;
890   PetscCall(VecGetArrayRead(bb,&b));
891   PetscCall(VecGetArray(xx,&x));
892   PetscCall(PetscArraycpy(x,b,5*mbs)); /* x <- b */
893   PetscCall(MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x));
894   PetscCall(VecRestoreArrayRead(bb,&b));
895   PetscCall(VecRestoreArray(xx,&x));
896   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
901 {
902   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
903   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
904   const MatScalar   *aa=a->a;
905   PetscScalar       *x;
906   const PetscScalar *b;
907 
908   PetscFunctionBegin;
909   PetscCall(VecGetArrayRead(bb,&b));
910   PetscCall(VecGetArray(xx,&x));
911   PetscCall(PetscArraycpy(x,b,5*mbs));
912   PetscCall(MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x));
913   PetscCall(VecRestoreArrayRead(bb,&b));
914   PetscCall(VecRestoreArray(xx,&x));
915   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs)));
916   PetscFunctionReturn(0);
917 }
918 
919 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
920 {
921   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
922   IS                isrow=a->row;
923   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
924   const PetscInt    *r,*vj;
925   PetscInt          nz,k,idx;
926   const MatScalar   *aa=a->a,*v,*diag;
927   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
928   const PetscScalar *b;
929 
930   PetscFunctionBegin;
931   PetscCall(VecGetArrayRead(bb,&b));
932   PetscCall(VecGetArray(xx,&x));
933   t    = a->solve_work;
934   PetscCall(ISGetIndices(isrow,&r));
935 
936   /* solve U^T * D * y = b by forward substitution */
937   tp = t;
938   for (k=0; k<mbs; k++) { /* t <- perm(b) */
939     idx   = 4*r[k];
940     tp[0] = b[idx];
941     tp[1] = b[idx+1];
942     tp[2] = b[idx+2];
943     tp[3] = b[idx+3];
944     tp   += 4;
945   }
946 
947   for (k=0; k<mbs; k++) {
948     v  = aa + 16*ai[k];
949     vj = aj + ai[k];
950     tp = t + k*4;
951     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
952     nz = ai[k+1] - ai[k];
953 
954     tp = t + (*vj)*4;
955     while (nz--) {
956       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
957       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
958       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
959       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
960       vj++;
961       tp = t + (*vj)*4;
962       v += 16;
963     }
964 
965     /* xk = inv(Dk)*(Dk*xk) */
966     diag  = aa+k*16;          /* ptr to inv(Dk) */
967     tp    = t + k*4;
968     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
969     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
970     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
971     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
972   }
973 
974   /* solve U*x = y by back substitution */
975   for (k=mbs-1; k>=0; k--) {
976     v  = aa + 16*ai[k];
977     vj = aj + ai[k];
978     tp = t + k*4;
979     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
980     nz = ai[k+1] - ai[k];
981 
982     tp = t + (*vj)*4;
983     while (nz--) {
984       /* xk += U(k,:)*x(:) */
985       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
986       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
987       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
988       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
989       vj++;
990       tp = t + (*vj)*4;
991       v += 16;
992     }
993     tp       = t + k*4;
994     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
995     idx      = 4*r[k];
996     x[idx]   = x0;
997     x[idx+1] = x1;
998     x[idx+2] = x2;
999     x[idx+3] = x3;
1000   }
1001 
1002   PetscCall(ISRestoreIndices(isrow,&r));
1003   PetscCall(VecRestoreArrayRead(bb,&b));
1004   PetscCall(VecRestoreArray(xx,&x));
1005   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1006   PetscFunctionReturn(0);
1007 }
1008 
1009 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1010 {
1011   const MatScalar *v,*diag;
1012   PetscScalar     *xp,x0,x1,x2,x3;
1013   PetscInt        nz,k;
1014   const PetscInt  *vj;
1015 
1016   PetscFunctionBegin;
1017   for (k=0; k<mbs; k++) {
1018     v  = aa + 16*ai[k];
1019     xp = x + k*4;
1020     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1021     nz = ai[k+1] - ai[k];
1022     vj = aj + ai[k];
1023     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1024     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1025     while (nz--) {
1026       xp = x + (*vj)*4;
1027       /* x(:) += U(k,:)^T*(Dk*xk) */
1028       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1029       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1030       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1031       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1032       vj++;
1033       v += 16;
1034     }
1035     /* xk = inv(Dk)*(Dk*xk) */
1036     diag  = aa+k*16;         /* ptr to inv(Dk) */
1037     xp    = x + k*4;
1038     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1039     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1040     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1041     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1042   }
1043   PetscFunctionReturn(0);
1044 }
1045 
1046 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1047 {
1048   const MatScalar *v;
1049   PetscScalar     *xp,x0,x1,x2,x3;
1050   PetscInt        nz,k;
1051   const PetscInt  *vj;
1052 
1053   PetscFunctionBegin;
1054   for (k=mbs-1; k>=0; k--) {
1055     v  = aa + 16*ai[k];
1056     xp = x + k*4;
1057     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1058     nz = ai[k+1] - ai[k];
1059     vj = aj + ai[k];
1060     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1061     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1062     while (nz--) {
1063       xp = x + (*vj)*4;
1064       /* xk += U(k,:)*x(:) */
1065       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1066       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1067       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1068       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1069       vj++;
1070       v += 16;
1071     }
1072     xp    = x + k*4;
1073     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1074   }
1075   PetscFunctionReturn(0);
1076 }
1077 
1078 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1079 {
1080   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1081   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1082   const MatScalar   *aa=a->a;
1083   PetscScalar       *x;
1084   const PetscScalar *b;
1085 
1086   PetscFunctionBegin;
1087   PetscCall(VecGetArrayRead(bb,&b));
1088   PetscCall(VecGetArray(xx,&x));
1089 
1090   /* solve U^T * D * y = b by forward substitution */
1091   PetscCall(PetscArraycpy(x,b,4*mbs)); /* x <- b */
1092   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x));
1093 
1094   /* solve U*x = y by back substitution */
1095   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x));
1096   PetscCall(VecRestoreArrayRead(bb,&b));
1097   PetscCall(VecRestoreArray(xx,&x));
1098   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1099   PetscFunctionReturn(0);
1100 }
1101 
1102 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1103 {
1104   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1105   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1106   const MatScalar   *aa=a->a;
1107   PetscScalar       *x;
1108   const PetscScalar *b;
1109 
1110   PetscFunctionBegin;
1111   PetscCall(VecGetArrayRead(bb,&b));
1112   PetscCall(VecGetArray(xx,&x));
1113   PetscCall(PetscArraycpy(x,b,4*mbs)); /* x <- b */
1114   PetscCall(MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x));
1115   PetscCall(VecRestoreArrayRead(bb,&b));
1116   PetscCall(VecRestoreArray(xx,&x));
1117   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
1118   PetscFunctionReturn(0);
1119 }
1120 
1121 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1122 {
1123   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1124   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1125   const MatScalar   *aa=a->a;
1126   PetscScalar       *x;
1127   const PetscScalar *b;
1128 
1129   PetscFunctionBegin;
1130   PetscCall(VecGetArrayRead(bb,&b));
1131   PetscCall(VecGetArray(xx,&x));
1132   PetscCall(PetscArraycpy(x,b,4*mbs));
1133   PetscCall(MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x));
1134   PetscCall(VecRestoreArrayRead(bb,&b));
1135   PetscCall(VecRestoreArray(xx,&x));
1136   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs)));
1137   PetscFunctionReturn(0);
1138 }
1139 
1140 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1141 {
1142   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1143   IS                isrow=a->row;
1144   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1145   const PetscInt    *r;
1146   PetscInt          nz,k,idx;
1147   const PetscInt    *vj;
1148   const MatScalar   *aa=a->a,*v,*diag;
1149   PetscScalar       *x,x0,x1,x2,*t,*tp;
1150   const PetscScalar *b;
1151 
1152   PetscFunctionBegin;
1153   PetscCall(VecGetArrayRead(bb,&b));
1154   PetscCall(VecGetArray(xx,&x));
1155   t    = a->solve_work;
1156   PetscCall(ISGetIndices(isrow,&r));
1157 
1158   /* solve U^T * D * y = b by forward substitution */
1159   tp = t;
1160   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1161     idx   = 3*r[k];
1162     tp[0] = b[idx];
1163     tp[1] = b[idx+1];
1164     tp[2] = b[idx+2];
1165     tp   += 3;
1166   }
1167 
1168   for (k=0; k<mbs; k++) {
1169     v  = aa + 9*ai[k];
1170     vj = aj + ai[k];
1171     tp = t + k*3;
1172     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1173     nz = ai[k+1] - ai[k];
1174 
1175     tp = t + (*vj)*3;
1176     while (nz--) {
1177       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1178       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1179       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1180       vj++;
1181       tp = t + (*vj)*3;
1182       v += 9;
1183     }
1184 
1185     /* xk = inv(Dk)*(Dk*xk) */
1186     diag  = aa+k*9;          /* ptr to inv(Dk) */
1187     tp    = t + k*3;
1188     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1189     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1190     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1191   }
1192 
1193   /* solve U*x = y by back substitution */
1194   for (k=mbs-1; k>=0; k--) {
1195     v  = aa + 9*ai[k];
1196     vj = aj + ai[k];
1197     tp = t + k*3;
1198     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1199     nz = ai[k+1] - ai[k];
1200 
1201     tp = t + (*vj)*3;
1202     while (nz--) {
1203       /* xk += U(k,:)*x(:) */
1204       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1205       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1206       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1207       vj++;
1208       tp = t + (*vj)*3;
1209       v += 9;
1210     }
1211     tp       = t + k*3;
1212     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1213     idx      = 3*r[k];
1214     x[idx]   = x0;
1215     x[idx+1] = x1;
1216     x[idx+2] = x2;
1217   }
1218 
1219   PetscCall(ISRestoreIndices(isrow,&r));
1220   PetscCall(VecRestoreArrayRead(bb,&b));
1221   PetscCall(VecRestoreArray(xx,&x));
1222   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1223   PetscFunctionReturn(0);
1224 }
1225 
1226 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1227 {
1228   const MatScalar *v,*diag;
1229   PetscScalar     *xp,x0,x1,x2;
1230   PetscInt        nz,k;
1231   const PetscInt  *vj;
1232 
1233   PetscFunctionBegin;
1234   for (k=0; k<mbs; k++) {
1235     v  = aa + 9*ai[k];
1236     xp = x + k*3;
1237     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1238     nz = ai[k+1] - ai[k];
1239     vj = aj + ai[k];
1240     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1241     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1242     while (nz--) {
1243       xp = x + (*vj)*3;
1244       /* x(:) += U(k,:)^T*(Dk*xk) */
1245       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1246       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1247       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1248       vj++;
1249       v += 9;
1250     }
1251     /* xk = inv(Dk)*(Dk*xk) */
1252     diag  = aa+k*9;         /* ptr to inv(Dk) */
1253     xp    = x + k*3;
1254     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1255     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1256     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1257   }
1258   PetscFunctionReturn(0);
1259 }
1260 
1261 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1262 {
1263   const MatScalar *v;
1264   PetscScalar     *xp,x0,x1,x2;
1265   PetscInt        nz,k;
1266   const PetscInt  *vj;
1267 
1268   PetscFunctionBegin;
1269   for (k=mbs-1; k>=0; k--) {
1270     v  = aa + 9*ai[k];
1271     xp = x + k*3;
1272     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1273     nz = ai[k+1] - ai[k];
1274     vj = aj + ai[k];
1275     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1276     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1277     while (nz--) {
1278       xp = x + (*vj)*3;
1279       /* xk += U(k,:)*x(:) */
1280       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1281       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1282       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1283       vj++;
1284       v += 9;
1285     }
1286     xp    = x + k*3;
1287     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1288   }
1289   PetscFunctionReturn(0);
1290 }
1291 
1292 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1293 {
1294   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1295   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1296   const MatScalar   *aa=a->a;
1297   PetscScalar       *x;
1298   const PetscScalar *b;
1299 
1300   PetscFunctionBegin;
1301   PetscCall(VecGetArrayRead(bb,&b));
1302   PetscCall(VecGetArray(xx,&x));
1303 
1304   /* solve U^T * D * y = b by forward substitution */
1305   PetscCall(PetscArraycpy(x,b,3*mbs));
1306   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x));
1307 
1308   /* solve U*x = y by back substitution */
1309   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x));
1310 
1311   PetscCall(VecRestoreArrayRead(bb,&b));
1312   PetscCall(VecRestoreArray(xx,&x));
1313   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1314   PetscFunctionReturn(0);
1315 }
1316 
1317 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1318 {
1319   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1320   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1321   const MatScalar   *aa=a->a;
1322   PetscScalar       *x;
1323   const PetscScalar *b;
1324 
1325   PetscFunctionBegin;
1326   PetscCall(VecGetArrayRead(bb,&b));
1327   PetscCall(VecGetArray(xx,&x));
1328   PetscCall(PetscArraycpy(x,b,3*mbs));
1329   PetscCall(MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x));
1330   PetscCall(VecRestoreArrayRead(bb,&b));
1331   PetscCall(VecRestoreArray(xx,&x));
1332   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
1333   PetscFunctionReturn(0);
1334 }
1335 
1336 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1337 {
1338   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1339   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1340   const MatScalar   *aa=a->a;
1341   PetscScalar       *x;
1342   const PetscScalar *b;
1343 
1344   PetscFunctionBegin;
1345   PetscCall(VecGetArrayRead(bb,&b));
1346   PetscCall(VecGetArray(xx,&x));
1347   PetscCall(PetscArraycpy(x,b,3*mbs));
1348   PetscCall(MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x));
1349   PetscCall(VecRestoreArrayRead(bb,&b));
1350   PetscCall(VecRestoreArray(xx,&x));
1351   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz-mbs)));
1352   PetscFunctionReturn(0);
1353 }
1354 
1355 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1356 {
1357   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1358   IS                isrow=a->row;
1359   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1360   const PetscInt    *r,*vj;
1361   PetscInt          nz,k,k2,idx;
1362   const MatScalar   *aa=a->a,*v,*diag;
1363   PetscScalar       *x,x0,x1,*t;
1364   const PetscScalar *b;
1365 
1366   PetscFunctionBegin;
1367   PetscCall(VecGetArrayRead(bb,&b));
1368   PetscCall(VecGetArray(xx,&x));
1369   t    = a->solve_work;
1370   PetscCall(ISGetIndices(isrow,&r));
1371 
1372   /* solve U^T * D * y = perm(b) by forward substitution */
1373   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1374     idx      = 2*r[k];
1375     t[k*2]   = b[idx];
1376     t[k*2+1] = b[idx+1];
1377   }
1378   for (k=0; k<mbs; k++) {
1379     v  = aa + 4*ai[k];
1380     vj = aj + ai[k];
1381     k2 = k*2;
1382     x0 = t[k2]; x1 = t[k2+1];
1383     nz = ai[k+1] - ai[k];
1384     while (nz--) {
1385       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1386       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1387       vj++; v      += 4;
1388     }
1389     diag    = aa+k*4; /* ptr to inv(Dk) */
1390     t[k2]   = diag[0]*x0 + diag[2]*x1;
1391     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1392   }
1393 
1394   /* solve U*x = y by back substitution */
1395   for (k=mbs-1; k>=0; k--) {
1396     v  = aa + 4*ai[k];
1397     vj = aj + ai[k];
1398     k2 = k*2;
1399     x0 = t[k2]; x1 = t[k2+1];
1400     nz = ai[k+1] - ai[k];
1401     while (nz--) {
1402       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1403       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1404       vj++;
1405       v += 4;
1406     }
1407     t[k2]    = x0;
1408     t[k2+1]  = x1;
1409     idx      = 2*r[k];
1410     x[idx]   = x0;
1411     x[idx+1] = x1;
1412   }
1413 
1414   PetscCall(ISRestoreIndices(isrow,&r));
1415   PetscCall(VecRestoreArrayRead(bb,&b));
1416   PetscCall(VecRestoreArray(xx,&x));
1417   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1418   PetscFunctionReturn(0);
1419 }
1420 
1421 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1422 {
1423   const MatScalar *v,*diag;
1424   PetscScalar     x0,x1;
1425   PetscInt        nz,k,k2;
1426   const PetscInt  *vj;
1427 
1428   PetscFunctionBegin;
1429   for (k=0; k<mbs; k++) {
1430     v  = aa + 4*ai[k];
1431     vj = aj + ai[k];
1432     k2 = k*2;
1433     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1434     nz = ai[k+1] - ai[k];
1435     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1436     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1437     while (nz--) {
1438       /* x(:) += U(k,:)^T*(Dk*xk) */
1439       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1440       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1441       vj++; v      += 4;
1442     }
1443     /* xk = inv(Dk)*(Dk*xk) */
1444     diag    = aa+k*4;       /* ptr to inv(Dk) */
1445     x[k2]   = diag[0]*x0 + diag[2]*x1;
1446     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1447   }
1448   PetscFunctionReturn(0);
1449 }
1450 
1451 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1452 {
1453   const MatScalar *v;
1454   PetscScalar     x0,x1;
1455   PetscInt        nz,k,k2;
1456   const PetscInt  *vj;
1457 
1458   PetscFunctionBegin;
1459   for (k=mbs-1; k>=0; k--) {
1460     v  = aa + 4*ai[k];
1461     vj = aj + ai[k];
1462     k2 = k*2;
1463     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1464     nz = ai[k+1] - ai[k];
1465     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1466     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1467     while (nz--) {
1468       /* xk += U(k,:)*x(:) */
1469       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1470       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1471       vj++;
1472       v += 4;
1473     }
1474     x[k2]   = x0;
1475     x[k2+1] = x1;
1476   }
1477   PetscFunctionReturn(0);
1478 }
1479 
1480 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1481 {
1482   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1483   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1484   const MatScalar   *aa=a->a;
1485   PetscScalar       *x;
1486   const PetscScalar *b;
1487 
1488   PetscFunctionBegin;
1489   PetscCall(VecGetArrayRead(bb,&b));
1490   PetscCall(VecGetArray(xx,&x));
1491 
1492   /* solve U^T * D * y = b by forward substitution */
1493   PetscCall(PetscArraycpy(x,b,2*mbs));
1494   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x));
1495 
1496   /* solve U*x = y by back substitution */
1497   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x));
1498 
1499   PetscCall(VecRestoreArrayRead(bb,&b));
1500   PetscCall(VecRestoreArray(xx,&x));
1501   PetscCall(PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs));
1502   PetscFunctionReturn(0);
1503 }
1504 
1505 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1506 {
1507   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1508   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1509   const MatScalar   *aa=a->a;
1510   PetscScalar       *x;
1511   const PetscScalar *b;
1512 
1513   PetscFunctionBegin;
1514   PetscCall(VecGetArrayRead(bb,&b));
1515   PetscCall(VecGetArray(xx,&x));
1516   PetscCall(PetscArraycpy(x,b,2*mbs));
1517   PetscCall(MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x));
1518   PetscCall(VecRestoreArrayRead(bb,&b));
1519   PetscCall(VecRestoreArray(xx,&x));
1520   PetscCall(PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs));
1521   PetscFunctionReturn(0);
1522 }
1523 
1524 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1525 {
1526   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1527   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1528   const MatScalar   *aa=a->a;
1529   PetscScalar       *x;
1530   const PetscScalar *b;
1531 
1532   PetscFunctionBegin;
1533   PetscCall(VecGetArrayRead(bb,&b));
1534   PetscCall(VecGetArray(xx,&x));
1535   PetscCall(PetscArraycpy(x,b,2*mbs));
1536   PetscCall(MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x));
1537   PetscCall(VecRestoreArrayRead(bb,&b));
1538   PetscCall(VecRestoreArray(xx,&x));
1539   PetscCall(PetscLogFlops(2.0*a->bs2*(a->nz - mbs)));
1540   PetscFunctionReturn(0);
1541 }
1542 
1543 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1544 {
1545   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1546   IS                isrow=a->row;
1547   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1548   const MatScalar   *aa=a->a,*v;
1549   const PetscScalar *b;
1550   PetscScalar       *x,xk,*t;
1551   PetscInt          nz,k,j;
1552 
1553   PetscFunctionBegin;
1554   PetscCall(VecGetArrayRead(bb,&b));
1555   PetscCall(VecGetArray(xx,&x));
1556   t    = a->solve_work;
1557   PetscCall(ISGetIndices(isrow,&rp));
1558 
1559   /* solve U^T*D*y = perm(b) by forward substitution */
1560   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1561   for (k=0; k<mbs; k++) {
1562     v  = aa + ai[k];
1563     vj = aj + ai[k];
1564     xk = t[k];
1565     nz = ai[k+1] - ai[k] - 1;
1566     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1567     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1568   }
1569 
1570   /* solve U*perm(x) = y by back substitution */
1571   for (k=mbs-1; k>=0; k--) {
1572     v  = aa + adiag[k] - 1;
1573     vj = aj + adiag[k] - 1;
1574     nz = ai[k+1] - ai[k] - 1;
1575     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1576     x[rp[k]] = t[k];
1577   }
1578 
1579   PetscCall(ISRestoreIndices(isrow,&rp));
1580   PetscCall(VecRestoreArrayRead(bb,&b));
1581   PetscCall(VecRestoreArray(xx,&x));
1582   PetscCall(PetscLogFlops(4.0*a->nz - 3.0*mbs));
1583   PetscFunctionReturn(0);
1584 }
1585 
1586 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1587 {
1588   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1589   IS                isrow=a->row;
1590   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1591   const MatScalar   *aa=a->a,*v;
1592   PetscScalar       *x,xk,*t;
1593   const PetscScalar *b;
1594   PetscInt          nz,k;
1595 
1596   PetscFunctionBegin;
1597   PetscCall(VecGetArrayRead(bb,&b));
1598   PetscCall(VecGetArray(xx,&x));
1599   t    = a->solve_work;
1600   PetscCall(ISGetIndices(isrow,&rp));
1601 
1602   /* solve U^T*D*y = perm(b) by forward substitution */
1603   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1604   for (k=0; k<mbs; k++) {
1605     v  = aa + ai[k] + 1;
1606     vj = aj + ai[k] + 1;
1607     xk = t[k];
1608     nz = ai[k+1] - ai[k] - 1;
1609     while (nz--) t[*vj++] += (*v++) * xk;
1610     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1611   }
1612 
1613   /* solve U*perm(x) = y by back substitution */
1614   for (k=mbs-1; k>=0; k--) {
1615     v  = aa + ai[k] + 1;
1616     vj = aj + ai[k] + 1;
1617     nz = ai[k+1] - ai[k] - 1;
1618     while (nz--) t[k] += (*v++) * t[*vj++];
1619     x[rp[k]] = t[k];
1620   }
1621 
1622   PetscCall(ISRestoreIndices(isrow,&rp));
1623   PetscCall(VecRestoreArrayRead(bb,&b));
1624   PetscCall(VecRestoreArray(xx,&x));
1625   PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs));
1626   PetscFunctionReturn(0);
1627 }
1628 
1629 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1630 {
1631   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1632   IS                isrow=a->row;
1633   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1634   const MatScalar   *aa=a->a,*v;
1635   PetscReal         diagk;
1636   PetscScalar       *x,xk;
1637   const PetscScalar *b;
1638   PetscInt          nz,k;
1639 
1640   PetscFunctionBegin;
1641   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1642   PetscCall(VecGetArrayRead(bb,&b));
1643   PetscCall(VecGetArray(xx,&x));
1644   PetscCall(ISGetIndices(isrow,&rp));
1645 
1646   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1647   for (k=0; k<mbs; k++) {
1648     v  = aa + ai[k];
1649     vj = aj + ai[k];
1650     xk = x[k];
1651     nz = ai[k+1] - ai[k] - 1;
1652     while (nz--) x[*vj++] += (*v++) * xk;
1653 
1654     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1655     PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1656     x[k] = xk*PetscSqrtReal(diagk);
1657   }
1658   PetscCall(ISRestoreIndices(isrow,&rp));
1659   PetscCall(VecRestoreArrayRead(bb,&b));
1660   PetscCall(VecRestoreArray(xx,&x));
1661   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
1662   PetscFunctionReturn(0);
1663 }
1664 
1665 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1666 {
1667   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1668   IS                isrow=a->row;
1669   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1670   const MatScalar   *aa=a->a,*v;
1671   PetscReal         diagk;
1672   PetscScalar       *x,xk;
1673   const PetscScalar *b;
1674   PetscInt          nz,k;
1675 
1676   PetscFunctionBegin;
1677   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1678   PetscCall(VecGetArrayRead(bb,&b));
1679   PetscCall(VecGetArray(xx,&x));
1680   PetscCall(ISGetIndices(isrow,&rp));
1681 
1682   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1683   for (k=0; k<mbs; k++) {
1684     v  = aa + ai[k] + 1;
1685     vj = aj + ai[k] + 1;
1686     xk = x[k];
1687     nz = ai[k+1] - ai[k] - 1;
1688     while (nz--) x[*vj++] += (*v++) * xk;
1689 
1690     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1691     PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1692     x[k] = xk*PetscSqrtReal(diagk);
1693   }
1694   PetscCall(ISRestoreIndices(isrow,&rp));
1695   PetscCall(VecRestoreArrayRead(bb,&b));
1696   PetscCall(VecRestoreArray(xx,&x));
1697   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
1698   PetscFunctionReturn(0);
1699 }
1700 
1701 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1702 {
1703   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1704   IS                isrow=a->row;
1705   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1706   const MatScalar   *aa=a->a,*v;
1707   PetscReal         diagk;
1708   PetscScalar       *x,*t;
1709   const PetscScalar *b;
1710   PetscInt          nz,k;
1711 
1712   PetscFunctionBegin;
1713   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1714   PetscCall(VecGetArrayRead(bb,&b));
1715   PetscCall(VecGetArray(xx,&x));
1716   t    = a->solve_work;
1717   PetscCall(ISGetIndices(isrow,&rp));
1718 
1719   for (k=mbs-1; k>=0; k--) {
1720     v     = aa + ai[k];
1721     vj    = aj + ai[k];
1722     diagk = PetscRealPart(aa[adiag[k]]);
1723     PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1724     t[k] = b[k] * PetscSqrtReal(diagk);
1725     nz   = ai[k+1] - ai[k] - 1;
1726     while (nz--) t[k] += (*v++) * t[*vj++];
1727     x[rp[k]] = t[k];
1728   }
1729   PetscCall(ISRestoreIndices(isrow,&rp));
1730   PetscCall(VecRestoreArrayRead(bb,&b));
1731   PetscCall(VecRestoreArray(xx,&x));
1732   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
1733   PetscFunctionReturn(0);
1734 }
1735 
1736 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1737 {
1738   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1739   IS                isrow=a->row;
1740   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1741   const MatScalar   *aa=a->a,*v;
1742   PetscReal         diagk;
1743   PetscScalar       *x,*t;
1744   const PetscScalar *b;
1745   PetscInt          nz,k;
1746 
1747   PetscFunctionBegin;
1748   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1749   PetscCall(VecGetArrayRead(bb,&b));
1750   PetscCall(VecGetArray(xx,&x));
1751   t    = a->solve_work;
1752   PetscCall(ISGetIndices(isrow,&rp));
1753 
1754   for (k=mbs-1; k>=0; k--) {
1755     v     = aa + ai[k] + 1;
1756     vj    = aj + ai[k] + 1;
1757     diagk = PetscRealPart(aa[ai[k]]);
1758     PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1759     t[k] = b[k] * PetscSqrtReal(diagk);
1760     nz   = ai[k+1] - ai[k] - 1;
1761     while (nz--) t[k] += (*v++) * t[*vj++];
1762     x[rp[k]] = t[k];
1763   }
1764   PetscCall(ISRestoreIndices(isrow,&rp));
1765   PetscCall(VecRestoreArrayRead(bb,&b));
1766   PetscCall(VecRestoreArray(xx,&x));
1767   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
1768   PetscFunctionReturn(0);
1769 }
1770 
1771 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1772 {
1773   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1774 
1775   PetscFunctionBegin;
1776   if (A->rmap->bs == 1) {
1777     PetscCall(MatSolve_SeqSBAIJ_1(A,bb->v,xx->v));
1778   } else {
1779     IS                isrow=a->row;
1780     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1781     const MatScalar   *aa=a->a,*v;
1782     PetscScalar       *x,*t;
1783     const PetscScalar *b;
1784     PetscInt          nz,k,n,i,j;
1785 
1786     if (bb->n > a->solves_work_n) {
1787       PetscCall(PetscFree(a->solves_work));
1788       PetscCall(PetscMalloc1(bb->n*A->rmap->N,&a->solves_work));
1789       a->solves_work_n = bb->n;
1790     }
1791     n    = bb->n;
1792     PetscCall(VecGetArrayRead(bb->v,&b));
1793     PetscCall(VecGetArray(xx->v,&x));
1794     t    = a->solves_work;
1795 
1796     PetscCall(ISGetIndices(isrow,&rp));
1797 
1798     /* solve U^T*D*y = perm(b) by forward substitution */
1799     for (k=0; k<mbs; k++) {
1800       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1801     }
1802     for (k=0; k<mbs; k++) {
1803       v  = aa + ai[k];
1804       vj = aj + ai[k];
1805       nz = ai[k+1] - ai[k] - 1;
1806       for (j=0; j<nz; j++) {
1807         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1808         v++;vj++;
1809       }
1810       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1811     }
1812 
1813     /* solve U*perm(x) = y by back substitution */
1814     for (k=mbs-1; k>=0; k--) {
1815       v  = aa + ai[k] - 1;
1816       vj = aj + ai[k] - 1;
1817       nz = ai[k+1] - ai[k] - 1;
1818       for (j=0; j<nz; j++) {
1819         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1820         v++;vj++;
1821       }
1822       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1823     }
1824 
1825     PetscCall(ISRestoreIndices(isrow,&rp));
1826     PetscCall(VecRestoreArrayRead(bb->v,&b));
1827     PetscCall(VecRestoreArray(xx->v,&x));
1828     PetscCall(PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs)));
1829   }
1830   PetscFunctionReturn(0);
1831 }
1832 
1833 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1834 {
1835   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1836 
1837   PetscFunctionBegin;
1838   if (A->rmap->bs == 1) {
1839     PetscCall(MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v));
1840   } else {
1841     IS                isrow=a->row;
1842     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1843     const MatScalar   *aa=a->a,*v;
1844     PetscScalar       *x,*t;
1845     const PetscScalar *b;
1846     PetscInt          nz,k,n,i;
1847 
1848     if (bb->n > a->solves_work_n) {
1849       PetscCall(PetscFree(a->solves_work));
1850       PetscCall(PetscMalloc1(bb->n*A->rmap->N,&a->solves_work));
1851       a->solves_work_n = bb->n;
1852     }
1853     n    = bb->n;
1854     PetscCall(VecGetArrayRead(bb->v,&b));
1855     PetscCall(VecGetArray(xx->v,&x));
1856     t    = a->solves_work;
1857 
1858     PetscCall(ISGetIndices(isrow,&rp));
1859 
1860     /* solve U^T*D*y = perm(b) by forward substitution */
1861     for (k=0; k<mbs; k++) {
1862       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1863     }
1864     for (k=0; k<mbs; k++) {
1865       v  = aa + ai[k];
1866       vj = aj + ai[k];
1867       nz = ai[k+1] - ai[k];
1868       while (nz--) {
1869         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1870         v++;vj++;
1871       }
1872       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1873     }
1874 
1875     /* solve U*perm(x) = y by back substitution */
1876     for (k=mbs-1; k>=0; k--) {
1877       v  = aa + ai[k];
1878       vj = aj + ai[k];
1879       nz = ai[k+1] - ai[k];
1880       while (nz--) {
1881         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1882         v++;vj++;
1883       }
1884       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1885     }
1886 
1887     PetscCall(ISRestoreIndices(isrow,&rp));
1888     PetscCall(VecRestoreArrayRead(bb->v,&b));
1889     PetscCall(VecRestoreArray(xx->v,&x));
1890     PetscCall(PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs)));
1891   }
1892   PetscFunctionReturn(0);
1893 }
1894 
1895 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1896 {
1897   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1898   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1899   const MatScalar   *aa=a->a,*v;
1900   const PetscScalar *b;
1901   PetscScalar       *x,xi;
1902   PetscInt          nz,i,j;
1903 
1904   PetscFunctionBegin;
1905   PetscCall(VecGetArrayRead(bb,&b));
1906   PetscCall(VecGetArray(xx,&x));
1907   /* solve U^T*D*y = b by forward substitution */
1908   PetscCall(PetscArraycpy(x,b,mbs));
1909   for (i=0; i<mbs; i++) {
1910     v  = aa + ai[i];
1911     vj = aj + ai[i];
1912     xi = x[i];
1913     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1914     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1915     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1916   }
1917   /* solve U*x = y by backward substitution */
1918   for (i=mbs-2; i>=0; i--) {
1919     xi = x[i];
1920     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1921     vj = aj + adiag[i] - 1;
1922     nz = ai[i+1] - ai[i] - 1;
1923     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1924     x[i] = xi;
1925   }
1926   PetscCall(VecRestoreArrayRead(bb,&b));
1927   PetscCall(VecRestoreArray(xx,&x));
1928   PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs));
1929   PetscFunctionReturn(0);
1930 }
1931 
1932 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1933 {
1934   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1935   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1936   const MatScalar   *aa=a->a,*v;
1937   const PetscScalar *b;
1938   PetscScalar       *x,xi;
1939   PetscInt          nz,i,j,neq,ldb,ldx;
1940   PetscBool         isdense;
1941 
1942   PetscFunctionBegin;
1943   if (!mbs) PetscFunctionReturn(0);
1944   PetscCall(PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense));
1945   PetscCheck(isdense,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1946   if (X != B) {
1947     PetscCall(PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense));
1948     PetscCheck(isdense,PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1949   }
1950   PetscCall(MatDenseGetArrayRead(B,&b));
1951   PetscCall(MatDenseGetLDA(B,&ldb));
1952   PetscCall(MatDenseGetArray(X,&x));
1953   PetscCall(MatDenseGetLDA(X,&ldx));
1954   for (neq=0; neq<B->cmap->n; neq++) {
1955     /* solve U^T*D*y = b by forward substitution */
1956     PetscCall(PetscArraycpy(x,b,mbs));
1957     for (i=0; i<mbs; i++) {
1958       v  = aa + ai[i];
1959       vj = aj + ai[i];
1960       xi = x[i];
1961       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1962       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1963       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1964     }
1965     /* solve U*x = y by backward substitution */
1966     for (i=mbs-2; i>=0; i--) {
1967       xi = x[i];
1968       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1969       vj = aj + adiag[i] - 1;
1970       nz = ai[i+1] - ai[i] - 1;
1971       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1972       x[i] = xi;
1973     }
1974     b += ldb;
1975     x += ldx;
1976   }
1977   PetscCall(MatDenseRestoreArrayRead(B,&b));
1978   PetscCall(MatDenseRestoreArray(X,&x));
1979   PetscCall(PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs)));
1980   PetscFunctionReturn(0);
1981 }
1982 
1983 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1984 {
1985   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1986   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
1987   const MatScalar   *aa=a->a,*v;
1988   PetscScalar       *x,xk;
1989   const PetscScalar *b;
1990   PetscInt          nz,k;
1991 
1992   PetscFunctionBegin;
1993   PetscCall(VecGetArrayRead(bb,&b));
1994   PetscCall(VecGetArray(xx,&x));
1995 
1996   /* solve U^T*D*y = b by forward substitution */
1997   PetscCall(PetscArraycpy(x,b,mbs));
1998   for (k=0; k<mbs; k++) {
1999     v  = aa + ai[k] + 1;
2000     vj = aj + ai[k] + 1;
2001     xk = x[k];
2002     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2003     while (nz--) x[*vj++] += (*v++) * xk;
2004     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2005   }
2006 
2007   /* solve U*x = y by back substitution */
2008   for (k=mbs-2; k>=0; k--) {
2009     v  = aa + ai[k] + 1;
2010     vj = aj + ai[k] + 1;
2011     xk = x[k];
2012     nz = ai[k+1] - ai[k] - 1;
2013     while (nz--) {
2014       xk += (*v++) * x[*vj++];
2015     }
2016     x[k] = xk;
2017   }
2018 
2019   PetscCall(VecRestoreArrayRead(bb,&b));
2020   PetscCall(VecRestoreArray(xx,&x));
2021   PetscCall(PetscLogFlops(4.0*a->nz - 3*mbs));
2022   PetscFunctionReturn(0);
2023 }
2024 
2025 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2026 {
2027   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2028   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2029   const MatScalar   *aa=a->a,*v;
2030   PetscReal         diagk;
2031   PetscScalar       *x;
2032   const PetscScalar *b;
2033   PetscInt          nz,k;
2034 
2035   PetscFunctionBegin;
2036   /* solve U^T*D^(1/2)*x = b by forward substitution */
2037   PetscCall(VecGetArrayRead(bb,&b));
2038   PetscCall(VecGetArray(xx,&x));
2039   PetscCall(PetscArraycpy(x,b,mbs));
2040   for (k=0; k<mbs; k++) {
2041     v  = aa + ai[k];
2042     vj = aj + ai[k];
2043     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2044     while (nz--) x[*vj++] += (*v++) * x[k];
2045     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2046     PetscCheck(!PetscImaginaryPart(aa[adiag[k]]) && diagk >= 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",(double)PetscRealPart(aa[adiag[k]]),(double)PetscImaginaryPart(aa[adiag[k]]));
2047     x[k] *= PetscSqrtReal(diagk);
2048   }
2049   PetscCall(VecRestoreArrayRead(bb,&b));
2050   PetscCall(VecRestoreArray(xx,&x));
2051   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
2052   PetscFunctionReturn(0);
2053 }
2054 
2055 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2056 {
2057   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2058   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2059   const MatScalar   *aa=a->a,*v;
2060   PetscReal         diagk;
2061   PetscScalar       *x;
2062   const PetscScalar *b;
2063   PetscInt          nz,k;
2064 
2065   PetscFunctionBegin;
2066   /* solve U^T*D^(1/2)*x = b by forward substitution */
2067   PetscCall(VecGetArrayRead(bb,&b));
2068   PetscCall(VecGetArray(xx,&x));
2069   PetscCall(PetscArraycpy(x,b,mbs));
2070   for (k=0; k<mbs; k++) {
2071     v  = aa + ai[k] + 1;
2072     vj = aj + ai[k] + 1;
2073     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2074     while (nz--) x[*vj++] += (*v++) * x[k];
2075     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2076     PetscCheck(!PetscImaginaryPart(aa[ai[k]]) && diagk >= 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",(double)PetscRealPart(aa[ai[k]]),(double)PetscImaginaryPart(aa[ai[k]]));
2077     x[k] *= PetscSqrtReal(diagk);
2078   }
2079   PetscCall(VecRestoreArrayRead(bb,&b));
2080   PetscCall(VecRestoreArray(xx,&x));
2081   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
2082   PetscFunctionReturn(0);
2083 }
2084 
2085 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2086 {
2087   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2088   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2089   const MatScalar   *aa=a->a,*v;
2090   PetscReal         diagk;
2091   PetscScalar       *x;
2092   const PetscScalar *b;
2093   PetscInt          nz,k;
2094 
2095   PetscFunctionBegin;
2096   /* solve D^(1/2)*U*x = b by back substitution */
2097   PetscCall(VecGetArrayRead(bb,&b));
2098   PetscCall(VecGetArray(xx,&x));
2099 
2100   for (k=mbs-1; k>=0; k--) {
2101     v     = aa + ai[k];
2102     vj    = aj + ai[k];
2103     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2104     PetscCheckFalse(PetscImaginaryPart(aa[adiag[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2105     x[k] = PetscSqrtReal(diagk)*b[k];
2106     nz   = ai[k+1] - ai[k] - 1;
2107     while (nz--) x[k] += (*v++) * x[*vj++];
2108   }
2109   PetscCall(VecRestoreArrayRead(bb,&b));
2110   PetscCall(VecRestoreArray(xx,&x));
2111   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
2112   PetscFunctionReturn(0);
2113 }
2114 
2115 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2116 {
2117   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2118   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2119   const MatScalar   *aa=a->a,*v;
2120   PetscReal         diagk;
2121   PetscScalar       *x;
2122   const PetscScalar *b;
2123   PetscInt          nz,k;
2124 
2125   PetscFunctionBegin;
2126   /* solve D^(1/2)*U*x = b by back substitution */
2127   PetscCall(VecGetArrayRead(bb,&b));
2128   PetscCall(VecGetArray(xx,&x));
2129 
2130   for (k=mbs-1; k>=0; k--) {
2131     v     = aa + ai[k] + 1;
2132     vj    = aj + ai[k] + 1;
2133     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2134     PetscCheckFalse(PetscImaginaryPart(aa[ai[k]]) || diagk < 0,PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2135     x[k] = PetscSqrtReal(diagk)*b[k];
2136     nz   = ai[k+1] - ai[k] - 1;
2137     while (nz--) x[k] += (*v++) * x[*vj++];
2138   }
2139   PetscCall(VecRestoreArrayRead(bb,&b));
2140   PetscCall(VecRestoreArray(xx,&x));
2141   PetscCall(PetscLogFlops(2.0*a->nz - mbs));
2142   PetscFunctionReturn(0);
2143 }
2144 
2145 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2146 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2147 {
2148   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2149   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2150   PetscInt       *jutmp,bs = A->rmap->bs,i;
2151   PetscInt       m,reallocs = 0,*levtmp;
2152   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2153   PetscInt       incrlev,*lev,shift,prow,nz;
2154   PetscReal      f = info->fill,levels = info->levels;
2155   PetscBool      perm_identity;
2156 
2157   PetscFunctionBegin;
2158   /* check whether perm is the identity mapping */
2159   PetscCall(ISIdentity(perm,&perm_identity));
2160 
2161   if (perm_identity) {
2162     a->permute = PETSC_FALSE;
2163     ai         = a->i; aj = a->j;
2164   } else { /*  non-trivial permutation */
2165     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2166   }
2167 
2168   /* initialization */
2169   PetscCall(ISGetIndices(perm,&rip));
2170   umax  = (PetscInt)(f*ai[mbs] + 1);
2171   PetscCall(PetscMalloc1(umax,&lev));
2172   umax += mbs + 1;
2173   shift = mbs + 1;
2174   PetscCall(PetscMalloc1(mbs+1,&iu));
2175   PetscCall(PetscMalloc1(umax,&ju));
2176   iu[0] = mbs + 1;
2177   juidx = mbs + 1;
2178   /* prowl: linked list for pivot row */
2179   PetscCall(PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp));
2180   /* q: linked list for col index */
2181 
2182   for (i=0; i<mbs; i++) {
2183     prowl[i] = mbs;
2184     q[i]     = 0;
2185   }
2186 
2187   /* for each row k */
2188   for (k=0; k<mbs; k++) {
2189     nzk  = 0;
2190     q[k] = mbs;
2191     /* copy current row into linked list */
2192     nz = ai[rip[k]+1] - ai[rip[k]];
2193     j  = ai[rip[k]];
2194     while (nz--) {
2195       vj = rip[aj[j++]];
2196       if (vj > k) {
2197         qm = k;
2198         do {
2199           m = qm; qm = q[m];
2200         } while (qm < vj);
2201         PetscCheck(qm != vj,PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A");
2202         nzk++;
2203         q[m]       = vj;
2204         q[vj]      = qm;
2205         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2206       }
2207     }
2208 
2209     /* modify nonzero structure of k-th row by computing fill-in
2210        for each row prow to be merged in */
2211     prow = k;
2212     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2213 
2214     while (prow < k) {
2215       /* merge row prow into k-th row */
2216       jmin = iu[prow] + 1;
2217       jmax = iu[prow+1];
2218       qm   = k;
2219       for (j=jmin; j<jmax; j++) {
2220         incrlev = lev[j-shift] + 1;
2221         if (incrlev > levels) continue;
2222         vj = ju[j];
2223         do {
2224           m = qm; qm = q[m];
2225         } while (qm < vj);
2226         if (qm != vj) {      /* a fill */
2227           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2228           levtmp[vj] = incrlev;
2229         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2230       }
2231       prow = prowl[prow]; /* next pivot row */
2232     }
2233 
2234     /* add k to row list for first nonzero element in k-th row */
2235     if (nzk > 1) {
2236       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2237       prowl[k] = prowl[i]; prowl[i] = k;
2238     }
2239     iu[k+1] = iu[k] + nzk;
2240 
2241     /* allocate more space to ju and lev if needed */
2242     if (iu[k+1] > umax) {
2243       /* estimate how much additional space we will need */
2244       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2245       /* just double the memory each time */
2246       maxadd = umax;
2247       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2248       umax += maxadd;
2249 
2250       /* allocate a longer ju */
2251       PetscCall(PetscMalloc1(umax,&jutmp));
2252       PetscCall(PetscArraycpy(jutmp,ju,iu[k]));
2253       PetscCall(PetscFree(ju));
2254       ju   = jutmp;
2255 
2256       PetscCall(PetscMalloc1(umax,&jutmp));
2257       PetscCall(PetscArraycpy(jutmp,lev,iu[k]-shift));
2258       PetscCall(PetscFree(lev));
2259       lev       = jutmp;
2260       reallocs += 2; /* count how many times we realloc */
2261     }
2262 
2263     /* save nonzero structure of k-th row in ju */
2264     i=k;
2265     while (nzk--) {
2266       i                = q[i];
2267       ju[juidx]        = i;
2268       lev[juidx-shift] = levtmp[i];
2269       juidx++;
2270     }
2271   }
2272 
2273 #if defined(PETSC_USE_INFO)
2274   if (ai[mbs] != 0) {
2275     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2276     PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af));
2277     PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af));
2278     PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g);\n",(double)af));
2279     PetscCall(PetscInfo(A,"for best performance.\n"));
2280   } else {
2281     PetscCall(PetscInfo(A,"Empty matrix\n"));
2282   }
2283 #endif
2284 
2285   PetscCall(ISRestoreIndices(perm,&rip));
2286   PetscCall(PetscFree3(prowl,q,levtmp));
2287   PetscCall(PetscFree(lev));
2288 
2289   /* put together the new matrix */
2290   PetscCall(MatSeqSBAIJSetPreallocation(B,bs,0,NULL));
2291 
2292   /* PetscCall(PetscLogObjectParent((PetscObject)B,(PetscObject)iperm)); */
2293   b    = (Mat_SeqSBAIJ*)(B)->data;
2294   PetscCall(PetscFree2(b->imax,b->ilen));
2295 
2296   b->singlemalloc = PETSC_FALSE;
2297   b->free_a       = PETSC_TRUE;
2298   b->free_ij      = PETSC_TRUE;
2299   /* the next line frees the default space generated by the Create() */
2300   PetscCall(PetscFree3(b->a,b->j,b->i));
2301   PetscCall(PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a));
2302   b->j    = ju;
2303   b->i    = iu;
2304   b->diag = NULL;
2305   b->ilen = NULL;
2306   b->imax = NULL;
2307 
2308   PetscCall(ISDestroy(&b->row));
2309   PetscCall(ISDestroy(&b->icol));
2310   b->row  = perm;
2311   b->icol = perm;
2312   PetscCall(PetscObjectReference((PetscObject)perm));
2313   PetscCall(PetscObjectReference((PetscObject)perm));
2314   PetscCall(PetscMalloc1(bs*mbs+bs,&b->solve_work));
2315   /* In b structure:  Free imax, ilen, old a, old j.
2316      Allocate idnew, solve_work, new a, new j */
2317   PetscCall(PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar))));
2318   b->maxnz = b->nz = iu[mbs];
2319 
2320   (B)->info.factor_mallocs   = reallocs;
2321   (B)->info.fill_ratio_given = f;
2322   if (ai[mbs] != 0) {
2323     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2324   } else {
2325     (B)->info.fill_ratio_needed = 0.0;
2326   }
2327   PetscCall(MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity));
2328   PetscFunctionReturn(0);
2329 }
2330 
2331 /*
2332   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2333 */
2334 #include <petscbt.h>
2335 #include <../src/mat/utils/freespace.h>
2336 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2337 {
2338   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2339   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2340   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2341   const PetscInt     *rip;
2342   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2343   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2344   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2345   PetscReal          fill          =info->fill,levels=info->levels;
2346   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2347   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2348   PetscBT            lnkbt;
2349 
2350   PetscFunctionBegin;
2351   PetscCheck(A->rmap->n == A->cmap->n,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %" PetscInt_FMT " columns %" PetscInt_FMT,A->rmap->n,A->cmap->n);
2352   PetscCall(MatMissingDiagonal(A,&missing,&d));
2353   PetscCheck(!missing,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %" PetscInt_FMT,d);
2354   if (bs > 1) {
2355     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info));
2356     PetscFunctionReturn(0);
2357   }
2358 
2359   /* check whether perm is the identity mapping */
2360   PetscCall(ISIdentity(perm,&perm_identity));
2361   PetscCheck(perm_identity,PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2362   a->permute = PETSC_FALSE;
2363 
2364   PetscCall(PetscMalloc1(am+1,&ui));
2365   PetscCall(PetscMalloc1(am+1,&udiag));
2366   ui[0] = 0;
2367 
2368   /* ICC(0) without matrix ordering: simply rearrange column indices */
2369   if (!levels) {
2370     /* reuse the column pointers and row offsets for memory savings */
2371     for (i=0; i<am; i++) {
2372       ncols    = ai[i+1] - ai[i];
2373       ui[i+1]  = ui[i] + ncols;
2374       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2375     }
2376     PetscCall(PetscMalloc1(ui[am]+1,&uj));
2377     cols = uj;
2378     for (i=0; i<am; i++) {
2379       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2380       ncols = ai[i+1] - ai[i] -1;
2381       for (j=0; j<ncols; j++) *cols++ = aj[j];
2382       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2383     }
2384   } else { /* case: levels>0 */
2385     PetscCall(ISGetIndices(perm,&rip));
2386 
2387     /* initialization */
2388     /* jl: linked list for storing indices of the pivot rows
2389        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2390     PetscCall(PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl));
2391     for (i=0; i<am; i++) {
2392       jl[i] = am; il[i] = 0;
2393     }
2394 
2395     /* create and initialize a linked list for storing column indices of the active row k */
2396     nlnk = am + 1;
2397     PetscCall(PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt));
2398 
2399     /* initial FreeSpace size is fill*(ai[am]+1) */
2400     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space));
2401 
2402     current_space = free_space;
2403 
2404     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl));
2405 
2406     current_space_lvl = free_space_lvl;
2407 
2408     for (k=0; k<am; k++) {  /* for each active row k */
2409       /* initialize lnk by the column indices of row k */
2410       nzk   = 0;
2411       ncols = ai[k+1] - ai[k];
2412       PetscCheck(ncols,PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %" PetscInt_FMT " in matrix ",k);
2413       cols = aj+ai[k];
2414       PetscCall(PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt));
2415       nzk += nlnk;
2416 
2417       /* update lnk by computing fill-in for each pivot row to be merged in */
2418       prow = jl[k]; /* 1st pivot row */
2419 
2420       while (prow < k) {
2421         nextprow = jl[prow];
2422 
2423         /* merge prow into k-th row */
2424         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2425         jmax  = ui[prow+1];
2426         ncols = jmax-jmin;
2427         i     = jmin - ui[prow];
2428 
2429         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2430         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2431         j    = *(uj - 1);
2432         PetscCall(PetscICCLLAddSorted(ncols,cols,levels,uj,am,&nlnk,lnk,lnk_lvl,lnkbt,j));
2433         nzk += nlnk;
2434 
2435         /* update il and jl for prow */
2436         if (jmin < jmax) {
2437           il[prow] = jmin;
2438 
2439           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2440         }
2441         prow = nextprow;
2442       }
2443 
2444       /* if free space is not available, make more free space */
2445       if (current_space->local_remaining<nzk) {
2446         i    = am - k + 1; /* num of unfactored rows */
2447         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2448         PetscCall(PetscFreeSpaceGet(i,&current_space));
2449         PetscCall(PetscFreeSpaceGet(i,&current_space_lvl));
2450         reallocs++;
2451       }
2452 
2453       /* copy data into free_space and free_space_lvl, then initialize lnk */
2454       PetscCheck(nzk != 0,PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %" PetscInt_FMT " in ICC matrix factor",k);
2455       PetscCall(PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt));
2456 
2457       /* add the k-th row into il and jl */
2458       if (nzk > 1) {
2459         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2460         jl[k] = jl[i]; jl[i] = k;
2461         il[k] = ui[k] + 1;
2462       }
2463       uj_ptr[k]     = current_space->array;
2464       uj_lvl_ptr[k] = current_space_lvl->array;
2465 
2466       current_space->array               += nzk;
2467       current_space->local_used          += nzk;
2468       current_space->local_remaining     -= nzk;
2469       current_space_lvl->array           += nzk;
2470       current_space_lvl->local_used      += nzk;
2471       current_space_lvl->local_remaining -= nzk;
2472 
2473       ui[k+1] = ui[k] + nzk;
2474     }
2475 
2476     PetscCall(ISRestoreIndices(perm,&rip));
2477     PetscCall(PetscFree4(uj_ptr,uj_lvl_ptr,il,jl));
2478 
2479     /* destroy list of free space and other temporary array(s) */
2480     PetscCall(PetscMalloc1(ui[am]+1,&uj));
2481     PetscCall(PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag)); /* store matrix factor  */
2482     PetscCall(PetscIncompleteLLDestroy(lnk,lnkbt));
2483     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2484 
2485   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2486 
2487   /* put together the new matrix in MATSEQSBAIJ format */
2488   PetscCall(MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL));
2489 
2490   b    = (Mat_SeqSBAIJ*)(fact)->data;
2491   PetscCall(PetscFree2(b->imax,b->ilen));
2492 
2493   b->singlemalloc = PETSC_FALSE;
2494   b->free_a       = PETSC_TRUE;
2495   b->free_ij      = free_ij;
2496 
2497   PetscCall(PetscMalloc1(ui[am]+1,&b->a));
2498 
2499   b->j         = uj;
2500   b->i         = ui;
2501   b->diag      = udiag;
2502   b->free_diag = PETSC_TRUE;
2503   b->ilen      = NULL;
2504   b->imax      = NULL;
2505   b->row       = perm;
2506   b->col       = perm;
2507 
2508   PetscCall(PetscObjectReference((PetscObject)perm));
2509   PetscCall(PetscObjectReference((PetscObject)perm));
2510 
2511   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2512 
2513   PetscCall(PetscMalloc1(am+1,&b->solve_work));
2514   PetscCall(PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar))));
2515 
2516   b->maxnz = b->nz = ui[am];
2517 
2518   fact->info.factor_mallocs   = reallocs;
2519   fact->info.fill_ratio_given = fill;
2520   if (ai[am] != 0) {
2521     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2522   } else {
2523     fact->info.fill_ratio_needed = 0.0;
2524   }
2525 #if defined(PETSC_USE_INFO)
2526   if (ai[am] != 0) {
2527     PetscReal af = fact->info.fill_ratio_needed;
2528     PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af));
2529     PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af));
2530     PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af));
2531   } else {
2532     PetscCall(PetscInfo(A,"Empty matrix\n"));
2533   }
2534 #endif
2535   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2536   PetscFunctionReturn(0);
2537 }
2538 
2539 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2540 {
2541   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2542   Mat_SeqSBAIJ       *b;
2543   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2544   PetscInt           bs=A->rmap->bs,am=a->mbs;
2545   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2546   PetscInt           reallocs=0,i,*ui;
2547   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2548   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2549   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2550   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2551   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2552   PetscBT            lnkbt;
2553 
2554   PetscFunctionBegin;
2555   /*
2556    This code originally uses Modified Sparse Row (MSR) storage
2557    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2558    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2559    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2560    thus the original code in MSR format is still used for these cases.
2561    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2562    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2563   */
2564   if (bs > 1) {
2565     PetscCall(MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info));
2566     PetscFunctionReturn(0);
2567   }
2568 
2569   /* check whether perm is the identity mapping */
2570   PetscCall(ISIdentity(perm,&perm_identity));
2571   PetscCheck(perm_identity,PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2572   a->permute = PETSC_FALSE;
2573 
2574   /* special case that simply copies fill pattern */
2575   if (!levels) {
2576     /* reuse the column pointers and row offsets for memory savings */
2577     ui           = a->i;
2578     uj           = a->j;
2579     free_ij      = PETSC_FALSE;
2580     ratio_needed = 1.0;
2581   } else { /* case: levels>0 */
2582     PetscCall(ISGetIndices(perm,&rip));
2583 
2584     /* initialization */
2585     PetscCall(PetscMalloc1(am+1,&ui));
2586     ui[0] = 0;
2587 
2588     /* jl: linked list for storing indices of the pivot rows
2589        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2590     PetscCall(PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl));
2591     for (i=0; i<am; i++) {
2592       jl[i] = am; il[i] = 0;
2593     }
2594 
2595     /* create and initialize a linked list for storing column indices of the active row k */
2596     nlnk = am + 1;
2597     PetscCall(PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt));
2598 
2599     /* initial FreeSpace size is fill*(ai[am]+1) */
2600     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space));
2601 
2602     current_space = free_space;
2603 
2604     PetscCall(PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl));
2605 
2606     current_space_lvl = free_space_lvl;
2607 
2608     for (k=0; k<am; k++) {  /* for each active row k */
2609       /* initialize lnk by the column indices of row rip[k] */
2610       nzk   = 0;
2611       ncols = ai[rip[k]+1] - ai[rip[k]];
2612       cols  = aj+ai[rip[k]];
2613       PetscCall(PetscIncompleteLLInit(ncols,cols,am,rip,&nlnk,lnk,lnk_lvl,lnkbt));
2614       nzk  += nlnk;
2615 
2616       /* update lnk by computing fill-in for each pivot row to be merged in */
2617       prow = jl[k]; /* 1st pivot row */
2618 
2619       while (prow < k) {
2620         nextprow = jl[prow];
2621 
2622         /* merge prow into k-th row */
2623         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2624         jmax     = ui[prow+1];
2625         ncols    = jmax-jmin;
2626         i        = jmin - ui[prow];
2627         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2628         j        = *(uj_lvl_ptr[prow] + i - 1);
2629         cols_lvl = uj_lvl_ptr[prow]+i;
2630         PetscCall(PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,&nlnk,lnk,lnk_lvl,lnkbt,j));
2631         nzk     += nlnk;
2632 
2633         /* update il and jl for prow */
2634         if (jmin < jmax) {
2635           il[prow] = jmin;
2636           j        = *cols;
2637           jl[prow] = jl[j];
2638           jl[j]    = prow;
2639         }
2640         prow = nextprow;
2641       }
2642 
2643       /* if free space is not available, make more free space */
2644       if (current_space->local_remaining<nzk) {
2645         i    = am - k + 1; /* num of unfactored rows */
2646         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2647         PetscCall(PetscFreeSpaceGet(i,&current_space));
2648         PetscCall(PetscFreeSpaceGet(i,&current_space_lvl));
2649         reallocs++;
2650       }
2651 
2652       /* copy data into free_space and free_space_lvl, then initialize lnk */
2653       PetscCall(PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt));
2654 
2655       /* add the k-th row into il and jl */
2656       if (nzk-1 > 0) {
2657         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2658         jl[k] = jl[i]; jl[i] = k;
2659         il[k] = ui[k] + 1;
2660       }
2661       uj_ptr[k]     = current_space->array;
2662       uj_lvl_ptr[k] = current_space_lvl->array;
2663 
2664       current_space->array               += nzk;
2665       current_space->local_used          += nzk;
2666       current_space->local_remaining     -= nzk;
2667       current_space_lvl->array           += nzk;
2668       current_space_lvl->local_used      += nzk;
2669       current_space_lvl->local_remaining -= nzk;
2670 
2671       ui[k+1] = ui[k] + nzk;
2672     }
2673 
2674     PetscCall(ISRestoreIndices(perm,&rip));
2675     PetscCall(PetscFree4(uj_ptr,uj_lvl_ptr,il,jl));
2676 
2677     /* destroy list of free space and other temporary array(s) */
2678     PetscCall(PetscMalloc1(ui[am]+1,&uj));
2679     PetscCall(PetscFreeSpaceContiguous(&free_space,uj));
2680     PetscCall(PetscIncompleteLLDestroy(lnk,lnkbt));
2681     PetscCall(PetscFreeSpaceDestroy(free_space_lvl));
2682     if (ai[am] != 0) {
2683       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2684     } else {
2685       ratio_needed = 0.0;
2686     }
2687   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2688 
2689   /* put together the new matrix in MATSEQSBAIJ format */
2690   PetscCall(MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL));
2691 
2692   b = (Mat_SeqSBAIJ*)(fact)->data;
2693 
2694   PetscCall(PetscFree2(b->imax,b->ilen));
2695 
2696   b->singlemalloc = PETSC_FALSE;
2697   b->free_a       = PETSC_TRUE;
2698   b->free_ij      = free_ij;
2699 
2700   PetscCall(PetscMalloc1(ui[am]+1,&b->a));
2701 
2702   b->j             = uj;
2703   b->i             = ui;
2704   b->diag          = NULL;
2705   b->ilen          = NULL;
2706   b->imax          = NULL;
2707   b->row           = perm;
2708   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2709 
2710   PetscCall(PetscObjectReference((PetscObject)perm));
2711   b->icol = perm;
2712   PetscCall(PetscObjectReference((PetscObject)perm));
2713   PetscCall(PetscMalloc1(am+1,&b->solve_work));
2714 
2715   b->maxnz = b->nz = ui[am];
2716 
2717   fact->info.factor_mallocs    = reallocs;
2718   fact->info.fill_ratio_given  = fill;
2719   fact->info.fill_ratio_needed = ratio_needed;
2720 #if defined(PETSC_USE_INFO)
2721   if (ai[am] != 0) {
2722     PetscReal af = fact->info.fill_ratio_needed;
2723     PetscCall(PetscInfo(A,"Reallocs %" PetscInt_FMT " Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af));
2724     PetscCall(PetscInfo(A,"Run with -pc_factor_fill %g or use \n",(double)af));
2725     PetscCall(PetscInfo(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af));
2726   } else {
2727     PetscCall(PetscInfo(A,"Empty matrix\n"));
2728   }
2729 #endif
2730   if (perm_identity) {
2731     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2732   } else {
2733     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2734   }
2735   PetscFunctionReturn(0);
2736 }
2737