xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision efa12513287cff49a2b9648ae83199dcbfaad71a)
1 
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include <../src/mat/impls/sbaij/seq/sbaij.h>
7 #include <../src/mat/impls/baij/seq/baij.h>
8 #include <petsc/private/kernels/blockinvert.h>
9 
10 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
11 {
12   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
13   IS                isrow=a->row;
14   PetscInt          mbs  =a->mbs,*ai=a->i,*aj=a->j;
15   PetscErrorCode    ierr;
16   const PetscInt    *r;
17   PetscInt          nz,*vj,k,idx,k1;
18   PetscInt          bs =A->rmap->bs,bs2 = a->bs2;
19   const MatScalar   *aa=a->a,*v,*diag;
20   PetscScalar       *x,*xk,*xj,*xk_tmp,*t;
21   const PetscScalar *b;
22 
23   PetscFunctionBegin;
24   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26   t    = a->solve_work;
27   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
28   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
29 
30   /* solve U^T * D * y = b by forward substitution */
31   xk = t;
32   for (k=0; k<mbs; k++) { /* t <- perm(b) */
33     idx = bs*r[k];
34     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
35   }
36   for (k=0; k<mbs; k++) {
37     v    = aa + bs2*ai[k];
38     xk   = t + k*bs;    /* Dk*xk = k-th block of x */
39     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
40     nz   = ai[k+1] - ai[k];
41     vj   = aj + ai[k];
42     xj   = t + (*vj)*bs; /* *vj-th block of x, *vj>k */
43     while (nz--) {
44       /* x(:) += U(k,:)^T*(Dk*xk) */
45       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
46       vj++; xj = t + (*vj)*bs;
47       v       += bs2;
48     }
49     /* xk = inv(Dk)*(Dk*xk) */
50     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
51     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
52   }
53 
54   /* solve U*x = y by back substitution */
55   for (k=mbs-1; k>=0; k--) {
56     v  = aa + bs2*ai[k];
57     xk = t + k*bs;        /* xk */
58     nz = ai[k+1] - ai[k];
59     vj = aj + ai[k];
60     xj = t + (*vj)*bs;
61     while (nz--) {
62       /* xk += U(k,:)*x(:) */
63       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
64       vj++;
65       v += bs2; xj = t + (*vj)*bs;
66     }
67     idx = bs*r[k];
68     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
69   }
70 
71   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
72   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
73   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
74   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
75   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
76   PetscFunctionReturn(0);
77 }
78 
79 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
80 {
81   PetscFunctionBegin;
82   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"not implemented yet");
83 }
84 
85 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
86 {
87   PetscFunctionBegin;
88   SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Not yet implemented");
89 }
90 
91 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
92 {
93   PetscErrorCode  ierr;
94   PetscInt        nz,k;
95   const PetscInt  *vj,bs2 = bs*bs;
96   const MatScalar *v,*diag;
97   PetscScalar     *xk,*xj,*xk_tmp;
98 
99   PetscFunctionBegin;
100   ierr = PetscMalloc1(bs,&xk_tmp);CHKERRQ(ierr);
101   for (k=0; k<mbs; k++) {
102     v    = aa + bs2*ai[k];
103     xk   = x + k*bs;    /* Dk*xk = k-th block of x */
104     ierr = PetscArraycpy(xk_tmp,xk,bs);CHKERRQ(ierr); /* xk_tmp <- xk */
105     nz   = ai[k+1] - ai[k];
106     vj   = aj + ai[k];
107     xj   = x + (*vj)*bs; /* *vj-th block of x, *vj>k */
108     while (nz--) {
109       /* x(:) += U(k,:)^T*(Dk*xk) */
110       PetscKernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
111       vj++; xj = x + (*vj)*bs;
112       v       += bs2;
113     }
114     /* xk = inv(Dk)*(Dk*xk) */
115     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
116     PetscKernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
117   }
118   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
119   PetscFunctionReturn(0);
120 }
121 
122 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
123 {
124   PetscInt        nz,k;
125   const PetscInt  *vj,bs2 = bs*bs;
126   const MatScalar *v;
127   PetscScalar     *xk,*xj;
128 
129   PetscFunctionBegin;
130   for (k=mbs-1; k>=0; k--) {
131     v  = aa + bs2*ai[k];
132     xk = x + k*bs;        /* xk */
133     nz = ai[k+1] - ai[k];
134     vj = aj + ai[k];
135     xj = x + (*vj)*bs;
136     while (nz--) {
137       /* xk += U(k,:)*x(:) */
138       PetscKernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
139       vj++;
140       v += bs2; xj = x + (*vj)*bs;
141     }
142   }
143   PetscFunctionReturn(0);
144 }
145 
146 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
147 {
148   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
149   PetscErrorCode    ierr;
150   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
151   PetscInt          bs =A->rmap->bs;
152   const MatScalar   *aa=a->a;
153   PetscScalar       *x;
154   const PetscScalar *b;
155 
156   PetscFunctionBegin;
157   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
159 
160   /* solve U^T * D * y = b by forward substitution */
161   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
162   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
163 
164   /* solve U*x = y by back substitution */
165   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
166 
167   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
169   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
170   PetscFunctionReturn(0);
171 }
172 
173 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
174 {
175   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
176   PetscErrorCode    ierr;
177   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
178   PetscInt          bs =A->rmap->bs;
179   const MatScalar   *aa=a->a;
180   const PetscScalar *b;
181   PetscScalar       *x;
182 
183   PetscFunctionBegin;
184   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
185   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
186   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr); /* x <- b */
187   ierr = MatForwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
188   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
189   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
190   ierr = PetscLogFlops(2.0*a->bs2*a->nz - bs*mbs);CHKERRQ(ierr);
191   PetscFunctionReturn(0);
192 }
193 
194 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
195 {
196   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
197   PetscErrorCode    ierr;
198   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
199   PetscInt          bs =A->rmap->bs;
200   const MatScalar   *aa=a->a;
201   const PetscScalar *b;
202   PetscScalar       *x;
203 
204   PetscFunctionBegin;
205   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
206   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
207   ierr = PetscArraycpy(x,b,bs*mbs);CHKERRQ(ierr);
208   ierr = MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
209   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
210   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
211   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
212   PetscFunctionReturn(0);
213 }
214 
215 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
216 {
217   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
218   IS                isrow=a->row;
219   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
220   PetscErrorCode    ierr;
221   PetscInt          nz,k,idx;
222   const MatScalar   *aa=a->a,*v,*d;
223   PetscScalar       *x,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
224   const PetscScalar *b;
225 
226   PetscFunctionBegin;
227   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
228   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
229   t    = a->solve_work;
230   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
231 
232   /* solve U^T * D * y = b by forward substitution */
233   tp = t;
234   for (k=0; k<mbs; k++) { /* t <- perm(b) */
235     idx   = 7*r[k];
236     tp[0] = b[idx];
237     tp[1] = b[idx+1];
238     tp[2] = b[idx+2];
239     tp[3] = b[idx+3];
240     tp[4] = b[idx+4];
241     tp[5] = b[idx+5];
242     tp[6] = b[idx+6];
243     tp   += 7;
244   }
245 
246   for (k=0; k<mbs; k++) {
247     v  = aa + 49*ai[k];
248     vj = aj + ai[k];
249     tp = t + k*7;
250     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
251     nz = ai[k+1] - ai[k];
252     tp = t + (*vj)*7;
253     while (nz--) {
254       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
255       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
256       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
257       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
258       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
259       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
260       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
261       vj++;
262       tp = t + (*vj)*7;
263       v += 49;
264     }
265 
266     /* xk = inv(Dk)*(Dk*xk) */
267     d     = aa+k*49;       /* ptr to inv(Dk) */
268     tp    = t + k*7;
269     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
270     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
271     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
272     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
273     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
274     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
275     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
276   }
277 
278   /* solve U*x = y by back substitution */
279   for (k=mbs-1; k>=0; k--) {
280     v  = aa + 49*ai[k];
281     vj = aj + ai[k];
282     tp = t + k*7;
283     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
284     nz = ai[k+1] - ai[k];
285 
286     tp = t + (*vj)*7;
287     while (nz--) {
288       /* xk += U(k,:)*x(:) */
289       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
290       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
291       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
292       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
293       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
294       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
295       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
296       vj++;
297       tp = t + (*vj)*7;
298       v += 49;
299     }
300     tp       = t + k*7;
301     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
302     idx      = 7*r[k];
303     x[idx]   = x0;
304     x[idx+1] = x1;
305     x[idx+2] = x2;
306     x[idx+3] = x3;
307     x[idx+4] = x4;
308     x[idx+5] = x5;
309     x[idx+6] = x6;
310   }
311 
312   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
313   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
314   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
315   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
316   PetscFunctionReturn(0);
317 }
318 
319 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
320 {
321   const MatScalar *v,*d;
322   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
323   PetscInt        nz,k;
324   const PetscInt  *vj;
325 
326   PetscFunctionBegin;
327   for (k=0; k<mbs; k++) {
328     v  = aa + 49*ai[k];
329     xp = x + k*7;
330     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
331     nz = ai[k+1] - ai[k];
332     vj = aj + ai[k];
333     xp = x + (*vj)*7;
334     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
335     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
336     while (nz--) {
337       /* x(:) += U(k,:)^T*(Dk*xk) */
338       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
339       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
340       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
341       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
342       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
343       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
344       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
345       vj++;
346       xp = x + (*vj)*7;
347       v += 49;
348     }
349     /* xk = inv(Dk)*(Dk*xk) */
350     d     = aa+k*49;       /* ptr to inv(Dk) */
351     xp    = x + k*7;
352     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
353     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
354     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
355     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
356     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
357     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
358     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
359   }
360   PetscFunctionReturn(0);
361 }
362 
363 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
364 {
365   const MatScalar *v;
366   PetscScalar     *xp,x0,x1,x2,x3,x4,x5,x6;
367   PetscInt        nz,k;
368   const PetscInt  *vj;
369 
370   PetscFunctionBegin;
371   for (k=mbs-1; k>=0; k--) {
372     v  = aa + 49*ai[k];
373     xp = x + k*7;
374     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
375     nz = ai[k+1] - ai[k];
376     vj = aj + ai[k];
377     xp = x + (*vj)*7;
378     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
379     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380     while (nz--) {
381       /* xk += U(k,:)*x(:) */
382       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
383       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
384       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
385       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
386       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
387       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
388       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
389       vj++;
390       v += 49; xp = x + (*vj)*7;
391     }
392     xp   = x + k*7;
393     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
394   }
395   PetscFunctionReturn(0);
396 }
397 
398 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
399 {
400   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
401   PetscErrorCode    ierr;
402   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
403   const MatScalar   *aa=a->a;
404   PetscScalar       *x;
405   const PetscScalar *b;
406 
407   PetscFunctionBegin;
408   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
409   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
410 
411   /* solve U^T * D * y = b by forward substitution */
412   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr); /* x <- b */
413   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
414 
415   /* solve U*x = y by back substitution */
416   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
417 
418   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
419   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
420   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
421   PetscFunctionReturn(0);
422 }
423 
424 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
425 {
426   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
427   PetscErrorCode    ierr;
428   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
429   const MatScalar   *aa=a->a;
430   PetscScalar       *x;
431   const PetscScalar *b;
432 
433   PetscFunctionBegin;
434   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
435   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
436   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
437   ierr = MatForwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
438   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
439   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
440   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
441   PetscFunctionReturn(0);
442 }
443 
444 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
445 {
446   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
447   PetscErrorCode    ierr;
448   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
449   const MatScalar   *aa=a->a;
450   PetscScalar       *x;
451   const PetscScalar *b;
452 
453   PetscFunctionBegin;
454   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
455   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
456   ierr = PetscArraycpy(x,b,7*mbs);CHKERRQ(ierr);
457   ierr = MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
458   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
459   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
460   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
461   PetscFunctionReturn(0);
462 }
463 
464 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
467   IS                isrow=a->row;
468   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j,*r,*vj;
469   PetscErrorCode    ierr;
470   PetscInt          nz,k,idx;
471   const MatScalar   *aa=a->a,*v,*d;
472   PetscScalar       *x,x0,x1,x2,x3,x4,x5,*t,*tp;
473   const PetscScalar *b;
474 
475   PetscFunctionBegin;
476   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478   t    = a->solve_work;
479   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
480 
481   /* solve U^T * D * y = b by forward substitution */
482   tp = t;
483   for (k=0; k<mbs; k++) { /* t <- perm(b) */
484     idx   = 6*r[k];
485     tp[0] = b[idx];
486     tp[1] = b[idx+1];
487     tp[2] = b[idx+2];
488     tp[3] = b[idx+3];
489     tp[4] = b[idx+4];
490     tp[5] = b[idx+5];
491     tp   += 6;
492   }
493 
494   for (k=0; k<mbs; k++) {
495     v  = aa + 36*ai[k];
496     vj = aj + ai[k];
497     tp = t + k*6;
498     x0 =tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
499     nz = ai[k+1] - ai[k];
500     tp = t + (*vj)*6;
501     while (nz--) {
502       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
503       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
504       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
505       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
506       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
507       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
508       vj++;
509       tp = t + (*vj)*6;
510       v += 36;
511     }
512 
513     /* xk = inv(Dk)*(Dk*xk) */
514     d     = aa+k*36;       /* ptr to inv(Dk) */
515     tp    = t + k*6;
516     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
517     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
518     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
519     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
520     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
521     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
522   }
523 
524   /* solve U*x = y by back substitution */
525   for (k=mbs-1; k>=0; k--) {
526     v  = aa + 36*ai[k];
527     vj = aj + ai[k];
528     tp = t + k*6;
529     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; /* xk */
530     nz = ai[k+1] - ai[k];
531 
532     tp = t + (*vj)*6;
533     while (nz--) {
534       /* xk += U(k,:)*x(:) */
535       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
536       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
537       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
538       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
539       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
540       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
541       vj++;
542       tp = t + (*vj)*6;
543       v += 36;
544     }
545     tp       = t + k*6;
546     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
547     idx      = 6*r[k];
548     x[idx]   = x0;
549     x[idx+1] = x1;
550     x[idx+2] = x2;
551     x[idx+3] = x3;
552     x[idx+4] = x4;
553     x[idx+5] = x5;
554   }
555 
556   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
557   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
558   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
559   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
560   PetscFunctionReturn(0);
561 }
562 
563 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
564 {
565   const MatScalar *v,*d;
566   PetscScalar     *xp,x0,x1,x2,x3,x4,x5;
567   PetscInt        nz,k;
568   const PetscInt  *vj;
569 
570   PetscFunctionBegin;
571   for (k=0; k<mbs; k++) {
572     v  = aa + 36*ai[k];
573     xp = x + k*6;
574     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
575     nz = ai[k+1] - ai[k];
576     vj = aj + ai[k];
577     xp = x + (*vj)*6;
578     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
579     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
580     while (nz--) {
581       /* x(:) += U(k,:)^T*(Dk*xk) */
582       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
583       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
584       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
585       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
586       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
587       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
588       vj++;
589       xp = x + (*vj)*6;
590       v += 36;
591     }
592     /* xk = inv(Dk)*(Dk*xk) */
593     d     = aa+k*36;       /* ptr to inv(Dk) */
594     xp    = x + k*6;
595     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
596     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
597     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
598     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
599     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
600     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
601   }
602   PetscFunctionReturn(0);
603 }
604 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
605 {
606   const MatScalar   *v;
607   PetscScalar       *xp,x0,x1,x2,x3,x4,x5;
608   PetscInt          nz,k;
609   const PetscInt    *vj;
610 
611   PetscFunctionBegin;
612   for (k=mbs-1; k>=0; k--) {
613     v  = aa + 36*ai[k];
614     xp = x + k*6;
615     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
616     nz = ai[k+1] - ai[k];
617     vj = aj + ai[k];
618     xp = x + (*vj)*6;
619     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
620     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
621     while (nz--) {
622       /* xk += U(k,:)*x(:) */
623       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
624       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
625       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
626       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
627       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
628       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
629       vj++;
630       v += 36; xp = x + (*vj)*6;
631     }
632     xp   = x + k*6;
633     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
634   }
635   PetscFunctionReturn(0);
636 }
637 
638 
639 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
640 {
641   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
642   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
643   const MatScalar   *aa=a->a;
644   PetscScalar       *x;
645   const PetscScalar *b;
646   PetscErrorCode    ierr;
647 
648   PetscFunctionBegin;
649   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
650   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
651 
652   /* solve U^T * D * y = b by forward substitution */
653   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
654   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
655 
656   /* solve U*x = y by back substitution */
657   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
658 
659   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
660   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
661   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
662   PetscFunctionReturn(0);
663 }
664 
665 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
666 {
667   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
668   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
669   const MatScalar   *aa=a->a;
670   PetscScalar       *x;
671   const PetscScalar *b;
672   PetscErrorCode    ierr;
673 
674   PetscFunctionBegin;
675   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
676   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
677   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
678   ierr = MatForwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
679   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
680   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
681   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
682   PetscFunctionReturn(0);
683 }
684 
685 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
686 {
687   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
688   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
689   const MatScalar   *aa=a->a;
690   PetscScalar       *x;
691   const PetscScalar *b;
692   PetscErrorCode    ierr;
693 
694   PetscFunctionBegin;
695   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
696   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
697   ierr = PetscArraycpy(x,b,6*mbs);CHKERRQ(ierr); /* x <- b */
698   ierr = MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
699   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
700   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
701   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
702   PetscFunctionReturn(0);
703 }
704 
705 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
706 {
707   Mat_SeqSBAIJ      *a=(Mat_SeqSBAIJ*)A->data;
708   IS                isrow=a->row;
709   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
710   PetscErrorCode    ierr;
711   const PetscInt    *r,*vj;
712   PetscInt          nz,k,idx;
713   const MatScalar   *aa=a->a,*v,*diag;
714   PetscScalar       *x,x0,x1,x2,x3,x4,*t,*tp;
715   const PetscScalar *b;
716 
717   PetscFunctionBegin;
718   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
720   t    = a->solve_work;
721   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
722 
723   /* solve U^T * D * y = b by forward substitution */
724   tp = t;
725   for (k=0; k<mbs; k++) { /* t <- perm(b) */
726     idx   = 5*r[k];
727     tp[0] = b[idx];
728     tp[1] = b[idx+1];
729     tp[2] = b[idx+2];
730     tp[3] = b[idx+3];
731     tp[4] = b[idx+4];
732     tp   += 5;
733   }
734 
735   for (k=0; k<mbs; k++) {
736     v  = aa + 25*ai[k];
737     vj = aj + ai[k];
738     tp = t + k*5;
739     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
740     nz = ai[k+1] - ai[k];
741 
742     tp = t + (*vj)*5;
743     while (nz--) {
744       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
745       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
746       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
747       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
748       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
749       vj++;
750       tp = t + (*vj)*5;
751       v += 25;
752     }
753 
754     /* xk = inv(Dk)*(Dk*xk) */
755     diag  = aa+k*25;          /* ptr to inv(Dk) */
756     tp    = t + k*5;
757     tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
758     tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
759     tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
760     tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
761     tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
762   }
763 
764   /* solve U*x = y by back substitution */
765   for (k=mbs-1; k>=0; k--) {
766     v  = aa + 25*ai[k];
767     vj = aj + ai[k];
768     tp = t + k*5;
769     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; /* xk */
770     nz = ai[k+1] - ai[k];
771 
772     tp = t + (*vj)*5;
773     while (nz--) {
774       /* xk += U(k,:)*x(:) */
775       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
776       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
777       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
778       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
779       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
780       vj++;
781       tp = t + (*vj)*5;
782       v += 25;
783     }
784     tp       = t + k*5;
785     tp[0]    = x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
786     idx      = 5*r[k];
787     x[idx]   = x0;
788     x[idx+1] = x1;
789     x[idx+2] = x2;
790     x[idx+3] = x3;
791     x[idx+4] = x4;
792   }
793 
794   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
795   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
796   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
797   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
798   PetscFunctionReturn(0);
799 }
800 
801 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
802 {
803   const MatScalar *v,*diag;
804   PetscScalar     *xp,x0,x1,x2,x3,x4;
805   PetscInt        nz,k;
806   const PetscInt  *vj;
807 
808   PetscFunctionBegin;
809   for (k=0; k<mbs; k++) {
810     v  = aa + 25*ai[k];
811     xp = x + k*5;
812     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
813     nz = ai[k+1] - ai[k];
814     vj = aj + ai[k];
815     xp = x + (*vj)*5;
816     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
817     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
818     while (nz--) {
819       /* x(:) += U(k,:)^T*(Dk*xk) */
820       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
821       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
822       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
823       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
824       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
825       vj++;
826       xp = x + (*vj)*5;
827       v += 25;
828     }
829     /* xk = inv(Dk)*(Dk*xk) */
830     diag  = aa+k*25;         /* ptr to inv(Dk) */
831     xp    = x + k*5;
832     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
833     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
834     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
835     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
836     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
837   }
838   PetscFunctionReturn(0);
839 }
840 
841 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
842 {
843   const MatScalar *v;
844   PetscScalar     *xp,x0,x1,x2,x3,x4;
845   PetscInt        nz,k;
846   const PetscInt  *vj;
847 
848   PetscFunctionBegin;
849   for (k=mbs-1; k>=0; k--) {
850     v  = aa + 25*ai[k];
851     xp = x + k*5;
852     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; /* xk */
853     nz = ai[k+1] - ai[k];
854     vj = aj + ai[k];
855     xp = x + (*vj)*5;
856     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
857     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
858     while (nz--) {
859       /* xk += U(k,:)*x(:) */
860       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
861       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
862       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
863       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
864       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
865       vj++;
866       v += 25; xp = x + (*vj)*5;
867     }
868     xp   = x + k*5;
869     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
870   }
871   PetscFunctionReturn(0);
872 }
873 
874 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
875 {
876   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
877   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
878   const MatScalar   *aa=a->a;
879   PetscScalar       *x;
880   const PetscScalar *b;
881   PetscErrorCode    ierr;
882 
883   PetscFunctionBegin;
884   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
885   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886 
887   /* solve U^T * D * y = b by forward substitution */
888   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
889   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
890 
891   /* solve U*x = y by back substitution */
892   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
893 
894   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
895   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
896   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
897   PetscFunctionReturn(0);
898 }
899 
900 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
901 {
902   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
903   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
904   const MatScalar   *aa=a->a;
905   PetscScalar       *x;
906   const PetscScalar *b;
907   PetscErrorCode    ierr;
908 
909   PetscFunctionBegin;
910   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr); /* x <- b */
913   ierr = MatForwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
914   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
915   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
916   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
917   PetscFunctionReturn(0);
918 }
919 
920 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
921 {
922   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
923   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
924   const MatScalar   *aa=a->a;
925   PetscScalar       *x;
926   const PetscScalar *b;
927   PetscErrorCode    ierr;
928 
929   PetscFunctionBegin;
930   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
931   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
932   ierr = PetscArraycpy(x,b,5*mbs);CHKERRQ(ierr);
933   ierr = MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
934   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
935   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
936   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
937   PetscFunctionReturn(0);
938 }
939 
940 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
941 {
942   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
943   IS                isrow=a->row;
944   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
945   PetscErrorCode    ierr;
946   const PetscInt    *r,*vj;
947   PetscInt          nz,k,idx;
948   const MatScalar   *aa=a->a,*v,*diag;
949   PetscScalar       *x,x0,x1,x2,x3,*t,*tp;
950   const PetscScalar *b;
951 
952   PetscFunctionBegin;
953   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
954   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
955   t    = a->solve_work;
956   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
957 
958   /* solve U^T * D * y = b by forward substitution */
959   tp = t;
960   for (k=0; k<mbs; k++) { /* t <- perm(b) */
961     idx   = 4*r[k];
962     tp[0] = b[idx];
963     tp[1] = b[idx+1];
964     tp[2] = b[idx+2];
965     tp[3] = b[idx+3];
966     tp   += 4;
967   }
968 
969   for (k=0; k<mbs; k++) {
970     v  = aa + 16*ai[k];
971     vj = aj + ai[k];
972     tp = t + k*4;
973     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
974     nz = ai[k+1] - ai[k];
975 
976     tp = t + (*vj)*4;
977     while (nz--) {
978       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
979       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
980       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
981       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
982       vj++;
983       tp = t + (*vj)*4;
984       v += 16;
985     }
986 
987     /* xk = inv(Dk)*(Dk*xk) */
988     diag  = aa+k*16;          /* ptr to inv(Dk) */
989     tp    = t + k*4;
990     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
991     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
992     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
993     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
994   }
995 
996   /* solve U*x = y by back substitution */
997   for (k=mbs-1; k>=0; k--) {
998     v  = aa + 16*ai[k];
999     vj = aj + ai[k];
1000     tp = t + k*4;
1001     x0 = tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1002     nz = ai[k+1] - ai[k];
1003 
1004     tp = t + (*vj)*4;
1005     while (nz--) {
1006       /* xk += U(k,:)*x(:) */
1007       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1008       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1009       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1010       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1011       vj++;
1012       tp = t + (*vj)*4;
1013       v += 16;
1014     }
1015     tp       = t + k*4;
1016     tp[0]    =x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1017     idx      = 4*r[k];
1018     x[idx]   = x0;
1019     x[idx+1] = x1;
1020     x[idx+2] = x2;
1021     x[idx+3] = x3;
1022   }
1023 
1024   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1025   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1026   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1027   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1028   PetscFunctionReturn(0);
1029 }
1030 
1031 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1032 {
1033   const MatScalar *v,*diag;
1034   PetscScalar     *xp,x0,x1,x2,x3;
1035   PetscInt        nz,k;
1036   const PetscInt  *vj;
1037 
1038   PetscFunctionBegin;
1039   for (k=0; k<mbs; k++) {
1040     v  = aa + 16*ai[k];
1041     xp = x + k*4;
1042     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1043     nz = ai[k+1] - ai[k];
1044     vj = aj + ai[k];
1045     xp = x + (*vj)*4;
1046     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1047     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1048     while (nz--) {
1049       /* x(:) += U(k,:)^T*(Dk*xk) */
1050       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1051       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1052       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1053       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1054       vj++;
1055       xp = x + (*vj)*4;
1056       v += 16;
1057     }
1058     /* xk = inv(Dk)*(Dk*xk) */
1059     diag  = aa+k*16;         /* ptr to inv(Dk) */
1060     xp    = x + k*4;
1061     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1062     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1063     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1064     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1065   }
1066   PetscFunctionReturn(0);
1067 }
1068 
1069 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1070 {
1071   const MatScalar *v;
1072   PetscScalar     *xp,x0,x1,x2,x3;
1073   PetscInt        nz,k;
1074   const PetscInt  *vj;
1075 
1076   PetscFunctionBegin;
1077   for (k=mbs-1; k>=0; k--) {
1078     v  = aa + 16*ai[k];
1079     xp = x + k*4;
1080     x0 = xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1081     nz = ai[k+1] - ai[k];
1082     vj = aj + ai[k];
1083     xp = x + (*vj)*4;
1084     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1085     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1086     while (nz--) {
1087       /* xk += U(k,:)*x(:) */
1088       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1089       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1090       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1091       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1092       vj++;
1093       v += 16; xp = x + (*vj)*4;
1094     }
1095     xp    = x + k*4;
1096     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1097   }
1098   PetscFunctionReturn(0);
1099 }
1100 
1101 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1102 {
1103   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1104   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1105   const MatScalar   *aa=a->a;
1106   PetscScalar       *x;
1107   const PetscScalar *b;
1108   PetscErrorCode    ierr;
1109 
1110   PetscFunctionBegin;
1111   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113 
1114   /* solve U^T * D * y = b by forward substitution */
1115   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1116   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1117 
1118   /* solve U*x = y by back substitution */
1119   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1120   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1121   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1122   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1123   PetscFunctionReturn(0);
1124 }
1125 
1126 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1127 {
1128   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1129   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1130   const MatScalar   *aa=a->a;
1131   PetscScalar       *x;
1132   const PetscScalar *b;
1133   PetscErrorCode    ierr;
1134 
1135   PetscFunctionBegin;
1136   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1137   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1138   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr); /* x <- b */
1139   ierr = MatForwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1140   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1141   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1142   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1143   PetscFunctionReturn(0);
1144 }
1145 
1146 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1147 {
1148   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1149   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1150   const MatScalar   *aa=a->a;
1151   PetscScalar       *x;
1152   const PetscScalar *b;
1153   PetscErrorCode    ierr;
1154 
1155   PetscFunctionBegin;
1156   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1157   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1158   ierr = PetscArraycpy(x,b,4*mbs);CHKERRQ(ierr);
1159   ierr = MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1160   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1163   PetscFunctionReturn(0);
1164 }
1165 
1166 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1167 {
1168   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1169   IS                isrow=a->row;
1170   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1171   PetscErrorCode    ierr;
1172   const PetscInt    *r;
1173   PetscInt          nz,k,idx;
1174   const PetscInt    *vj;
1175   const MatScalar   *aa=a->a,*v,*diag;
1176   PetscScalar       *x,x0,x1,x2,*t,*tp;
1177   const PetscScalar *b;
1178 
1179   PetscFunctionBegin;
1180   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1181   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182   t    = a->solve_work;
1183   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1184 
1185   /* solve U^T * D * y = b by forward substitution */
1186   tp = t;
1187   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1188     idx   = 3*r[k];
1189     tp[0] = b[idx];
1190     tp[1] = b[idx+1];
1191     tp[2] = b[idx+2];
1192     tp   += 3;
1193   }
1194 
1195   for (k=0; k<mbs; k++) {
1196     v  = aa + 9*ai[k];
1197     vj = aj + ai[k];
1198     tp = t + k*3;
1199     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1200     nz = ai[k+1] - ai[k];
1201 
1202     tp = t + (*vj)*3;
1203     while (nz--) {
1204       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1205       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1206       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1207       vj++;
1208       tp = t + (*vj)*3;
1209       v += 9;
1210     }
1211 
1212     /* xk = inv(Dk)*(Dk*xk) */
1213     diag  = aa+k*9;          /* ptr to inv(Dk) */
1214     tp    = t + k*3;
1215     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1216     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1217     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1218   }
1219 
1220   /* solve U*x = y by back substitution */
1221   for (k=mbs-1; k>=0; k--) {
1222     v  = aa + 9*ai[k];
1223     vj = aj + ai[k];
1224     tp = t + k*3;
1225     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1226     nz = ai[k+1] - ai[k];
1227 
1228     tp = t + (*vj)*3;
1229     while (nz--) {
1230       /* xk += U(k,:)*x(:) */
1231       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1232       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1233       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1234       vj++;
1235       tp = t + (*vj)*3;
1236       v += 9;
1237     }
1238     tp       = t + k*3;
1239     tp[0]    = x0; tp[1] = x1; tp[2] = x2;
1240     idx      = 3*r[k];
1241     x[idx]   = x0;
1242     x[idx+1] = x1;
1243     x[idx+2] = x2;
1244   }
1245 
1246   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1247   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1248   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1249   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1250   PetscFunctionReturn(0);
1251 }
1252 
1253 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1254 {
1255   const MatScalar *v,*diag;
1256   PetscScalar     *xp,x0,x1,x2;
1257   PetscInt        nz,k;
1258   const PetscInt  *vj;
1259 
1260   PetscFunctionBegin;
1261   for (k=0; k<mbs; k++) {
1262     v  = aa + 9*ai[k];
1263     xp = x + k*3;
1264     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1265     nz = ai[k+1] - ai[k];
1266     vj = aj + ai[k];
1267     xp = x + (*vj)*3;
1268     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1269     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1270     while (nz--) {
1271       /* x(:) += U(k,:)^T*(Dk*xk) */
1272       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1273       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1274       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1275       vj++;
1276       xp = x + (*vj)*3;
1277       v += 9;
1278     }
1279     /* xk = inv(Dk)*(Dk*xk) */
1280     diag  = aa+k*9;         /* ptr to inv(Dk) */
1281     xp    = x + k*3;
1282     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1283     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1284     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1285   }
1286   PetscFunctionReturn(0);
1287 }
1288 
1289 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1290 {
1291   const MatScalar *v;
1292   PetscScalar     *xp,x0,x1,x2;
1293   PetscInt        nz,k;
1294   const PetscInt  *vj;
1295 
1296   PetscFunctionBegin;
1297   for (k=mbs-1; k>=0; k--) {
1298     v  = aa + 9*ai[k];
1299     xp = x + k*3;
1300     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1301     nz = ai[k+1] - ai[k];
1302     vj = aj + ai[k];
1303     xp = x + (*vj)*3;
1304     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1305     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1306     while (nz--) {
1307       /* xk += U(k,:)*x(:) */
1308       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1309       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1310       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1311       vj++;
1312       v += 9; xp = x + (*vj)*3;
1313     }
1314     xp    = x + k*3;
1315     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1316   }
1317   PetscFunctionReturn(0);
1318 }
1319 
1320 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1321 {
1322   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1323   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1324   const MatScalar   *aa=a->a;
1325   PetscScalar       *x;
1326   const PetscScalar *b;
1327   PetscErrorCode    ierr;
1328 
1329   PetscFunctionBegin;
1330   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1331   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1332 
1333   /* solve U^T * D * y = b by forward substitution */
1334   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1335   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1336 
1337   /* solve U*x = y by back substitution */
1338   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1339 
1340   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1341   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1342   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1343   PetscFunctionReturn(0);
1344 }
1345 
1346 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1347 {
1348   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1349   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1350   const MatScalar   *aa=a->a;
1351   PetscScalar       *x;
1352   const PetscScalar *b;
1353   PetscErrorCode    ierr;
1354 
1355   PetscFunctionBegin;
1356   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1357   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1358   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1359   ierr = MatForwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1360   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1361   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1362   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1363   PetscFunctionReturn(0);
1364 }
1365 
1366 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1367 {
1368   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1369   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1370   const MatScalar   *aa=a->a;
1371   PetscScalar       *x;
1372   const PetscScalar *b;
1373   PetscErrorCode    ierr;
1374 
1375   PetscFunctionBegin;
1376   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1377   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1378   ierr = PetscArraycpy(x,b,3*mbs);CHKERRQ(ierr);
1379   ierr = MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1380   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1381   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1382   ierr = PetscLogFlops(2.0*a->bs2*(a->nz-mbs));CHKERRQ(ierr);
1383   PetscFunctionReturn(0);
1384 }
1385 
1386 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1387 {
1388   Mat_SeqSBAIJ      *a   =(Mat_SeqSBAIJ*)A->data;
1389   IS                isrow=a->row;
1390   const PetscInt    mbs  =a->mbs,*ai=a->i,*aj=a->j;
1391   PetscErrorCode    ierr;
1392   const PetscInt    *r,*vj;
1393   PetscInt          nz,k,k2,idx;
1394   const MatScalar   *aa=a->a,*v,*diag;
1395   PetscScalar       *x,x0,x1,*t;
1396   const PetscScalar *b;
1397 
1398   PetscFunctionBegin;
1399   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1400   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1401   t    = a->solve_work;
1402   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1403 
1404   /* solve U^T * D * y = perm(b) by forward substitution */
1405   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1406     idx      = 2*r[k];
1407     t[k*2]   = b[idx];
1408     t[k*2+1] = b[idx+1];
1409   }
1410   for (k=0; k<mbs; k++) {
1411     v  = aa + 4*ai[k];
1412     vj = aj + ai[k];
1413     k2 = k*2;
1414     x0 = t[k2]; x1 = t[k2+1];
1415     nz = ai[k+1] - ai[k];
1416     while (nz--) {
1417       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1418       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1419       vj++; v      += 4;
1420     }
1421     diag    = aa+k*4; /* ptr to inv(Dk) */
1422     t[k2]   = diag[0]*x0 + diag[2]*x1;
1423     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1424   }
1425 
1426   /* solve U*x = y by back substitution */
1427   for (k=mbs-1; k>=0; k--) {
1428     v  = aa + 4*ai[k];
1429     vj = aj + ai[k];
1430     k2 = k*2;
1431     x0 = t[k2]; x1 = t[k2+1];
1432     nz = ai[k+1] - ai[k];
1433     while (nz--) {
1434       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1435       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1436       vj++;
1437       v += 4;
1438     }
1439     t[k2]    = x0;
1440     t[k2+1]  = x1;
1441     idx      = 2*r[k];
1442     x[idx]   = x0;
1443     x[idx+1] = x1;
1444   }
1445 
1446   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1447   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1448   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1449   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1450   PetscFunctionReturn(0);
1451 }
1452 
1453 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1454 {
1455   const MatScalar *v,*diag;
1456   PetscScalar     x0,x1;
1457   PetscInt        nz,k,k2;
1458   const PetscInt  *vj;
1459 
1460   PetscFunctionBegin;
1461   for (k=0; k<mbs; k++) {
1462     v  = aa + 4*ai[k];
1463     vj = aj + ai[k];
1464     k2 = k*2;
1465     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1466     nz = ai[k+1] - ai[k];
1467     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1468     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1469     while (nz--) {
1470       /* x(:) += U(k,:)^T*(Dk*xk) */
1471       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1472       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1473       vj++; v      += 4;
1474     }
1475     /* xk = inv(Dk)*(Dk*xk) */
1476     diag    = aa+k*4;       /* ptr to inv(Dk) */
1477     x[k2]   = diag[0]*x0 + diag[2]*x1;
1478     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1479   }
1480   PetscFunctionReturn(0);
1481 }
1482 
1483 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(const PetscInt *ai,const PetscInt *aj,const MatScalar *aa,PetscInt mbs,PetscScalar *x)
1484 {
1485   const MatScalar *v;
1486   PetscScalar     x0,x1;
1487   PetscInt        nz,k,k2;
1488   const PetscInt  *vj;
1489 
1490   PetscFunctionBegin;
1491   for (k=mbs-1; k>=0; k--) {
1492     v  = aa + 4*ai[k];
1493     vj = aj + ai[k];
1494     k2 = k*2;
1495     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1496     nz = ai[k+1] - ai[k];
1497     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1498     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1499     while (nz--) {
1500       /* xk += U(k,:)*x(:) */
1501       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1502       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1503       vj++;
1504       v += 4;
1505     }
1506     x[k2]   = x0;
1507     x[k2+1] = x1;
1508   }
1509   PetscFunctionReturn(0);
1510 }
1511 
1512 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1513 {
1514   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1515   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1516   const MatScalar   *aa=a->a;
1517   PetscScalar       *x;
1518   const PetscScalar *b;
1519   PetscErrorCode    ierr;
1520 
1521   PetscFunctionBegin;
1522   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1523   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1524 
1525   /* solve U^T * D * y = b by forward substitution */
1526   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1527   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1528 
1529   /* solve U*x = y by back substitution */
1530   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1531 
1532   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1533   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1534   ierr = PetscLogFlops(4.0*a->bs2*a->nz - (A->rmap->bs+2.0*a->bs2)*mbs);CHKERRQ(ierr);
1535   PetscFunctionReturn(0);
1536 }
1537 
1538 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1539 {
1540   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1541   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1542   const MatScalar   *aa=a->a;
1543   PetscScalar       *x;
1544   const PetscScalar *b;
1545   PetscErrorCode    ierr;
1546 
1547   PetscFunctionBegin;
1548   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1549   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1550   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1551   ierr = MatForwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1552   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1553   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1554   ierr = PetscLogFlops(2.0*a->bs2*a->nz - A->rmap->bs*mbs);CHKERRQ(ierr);
1555   PetscFunctionReturn(0);
1556 }
1557 
1558 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1559 {
1560   Mat_SeqSBAIJ      *a =(Mat_SeqSBAIJ*)A->data;
1561   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j;
1562   const MatScalar   *aa=a->a;
1563   PetscScalar       *x;
1564   const PetscScalar *b;
1565   PetscErrorCode    ierr;
1566 
1567   PetscFunctionBegin;
1568   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1569   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1570   ierr = PetscArraycpy(x,b,2*mbs);CHKERRQ(ierr);
1571   ierr = MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1572   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1574   ierr = PetscLogFlops(2.0*a->bs2*(a->nz - mbs));CHKERRQ(ierr);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1579 {
1580   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1581   IS                isrow=a->row;
1582   PetscErrorCode    ierr;
1583   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1584   const MatScalar   *aa=a->a,*v;
1585   const PetscScalar *b;
1586   PetscScalar       *x,xk,*t;
1587   PetscInt          nz,k,j;
1588 
1589   PetscFunctionBegin;
1590   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1591   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1592   t    = a->solve_work;
1593   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1594 
1595   /* solve U^T*D*y = perm(b) by forward substitution */
1596   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1597   for (k=0; k<mbs; k++) {
1598     v  = aa + ai[k];
1599     vj = aj + ai[k];
1600     xk = t[k];
1601     nz = ai[k+1] - ai[k] - 1;
1602     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1603     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1604   }
1605 
1606   /* solve U*perm(x) = y by back substitution */
1607   for (k=mbs-1; k>=0; k--) {
1608     v  = aa + adiag[k] - 1;
1609     vj = aj + adiag[k] - 1;
1610     nz = ai[k+1] - ai[k] - 1;
1611     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1612     x[rp[k]] = t[k];
1613   }
1614 
1615   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1616   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1617   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1618   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1619   PetscFunctionReturn(0);
1620 }
1621 
1622 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1623 {
1624   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1625   IS                isrow=a->row;
1626   PetscErrorCode    ierr;
1627   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1628   const MatScalar   *aa=a->a,*v;
1629   PetscScalar       *x,xk,*t;
1630   const PetscScalar *b;
1631   PetscInt          nz,k;
1632 
1633   PetscFunctionBegin;
1634   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1635   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1636   t    = a->solve_work;
1637   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1638 
1639   /* solve U^T*D*y = perm(b) by forward substitution */
1640   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1641   for (k=0; k<mbs; k++) {
1642     v  = aa + ai[k] + 1;
1643     vj = aj + ai[k] + 1;
1644     xk = t[k];
1645     nz = ai[k+1] - ai[k] - 1;
1646     while (nz--) t[*vj++] += (*v++) * xk;
1647     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1648   }
1649 
1650   /* solve U*perm(x) = y by back substitution */
1651   for (k=mbs-1; k>=0; k--) {
1652     v  = aa + ai[k] + 1;
1653     vj = aj + ai[k] + 1;
1654     nz = ai[k+1] - ai[k] - 1;
1655     while (nz--) t[k] += (*v++) * t[*vj++];
1656     x[rp[k]] = t[k];
1657   }
1658 
1659   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1660   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1661   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1662   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1663   PetscFunctionReturn(0);
1664 }
1665 
1666 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1667 {
1668   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1669   IS                isrow=a->row;
1670   PetscErrorCode    ierr;
1671   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1672   const MatScalar   *aa=a->a,*v;
1673   PetscReal         diagk;
1674   PetscScalar       *x,xk;
1675   const PetscScalar *b;
1676   PetscInt          nz,k;
1677 
1678   PetscFunctionBegin;
1679   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1680   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1681   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1682   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1683 
1684   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1685   for (k=0; k<mbs; k++) {
1686     v  = aa + ai[k];
1687     vj = aj + ai[k];
1688     xk = x[k];
1689     nz = ai[k+1] - ai[k] - 1;
1690     while (nz--) x[*vj++] += (*v++) * xk;
1691 
1692     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1693     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1694     x[k] = xk*PetscSqrtReal(diagk);
1695   }
1696   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1697   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1698   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1699   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1700   PetscFunctionReturn(0);
1701 }
1702 
1703 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1704 {
1705   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1706   IS                isrow=a->row;
1707   PetscErrorCode    ierr;
1708   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1709   const MatScalar   *aa=a->a,*v;
1710   PetscReal         diagk;
1711   PetscScalar       *x,xk;
1712   const PetscScalar *b;
1713   PetscInt          nz,k;
1714 
1715   PetscFunctionBegin;
1716   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1717   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1718   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1719   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1720 
1721   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1722   for (k=0; k<mbs; k++) {
1723     v  = aa + ai[k] + 1;
1724     vj = aj + ai[k] + 1;
1725     xk = x[k];
1726     nz = ai[k+1] - ai[k] - 1;
1727     while (nz--) x[*vj++] += (*v++) * xk;
1728 
1729     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1730     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1731     x[k] = xk*PetscSqrtReal(diagk);
1732   }
1733   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1734   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1735   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1736   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1737   PetscFunctionReturn(0);
1738 }
1739 
1740 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1741 {
1742   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1743   IS                isrow=a->row;
1744   PetscErrorCode    ierr;
1745   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag=a->diag;
1746   const MatScalar   *aa=a->a,*v;
1747   PetscReal         diagk;
1748   PetscScalar       *x,*t;
1749   const PetscScalar *b;
1750   PetscInt          nz,k;
1751 
1752   PetscFunctionBegin;
1753   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1754   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1755   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1756   t    = a->solve_work;
1757   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1758 
1759   for (k=mbs-1; k>=0; k--) {
1760     v     = aa + ai[k];
1761     vj    = aj + ai[k];
1762     diagk = PetscRealPart(aa[adiag[k]]);
1763     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1764     t[k] = b[k] * PetscSqrtReal(diagk);
1765     nz   = ai[k+1] - ai[k] - 1;
1766     while (nz--) t[k] += (*v++) * t[*vj++];
1767     x[rp[k]] = t[k];
1768   }
1769   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1770   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1771   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1772   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1773   PetscFunctionReturn(0);
1774 }
1775 
1776 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1777 {
1778   Mat_SeqSBAIJ      *a   = (Mat_SeqSBAIJ*)A->data;
1779   IS                isrow=a->row;
1780   PetscErrorCode    ierr;
1781   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1782   const MatScalar   *aa=a->a,*v;
1783   PetscReal         diagk;
1784   PetscScalar       *x,*t;
1785   const PetscScalar *b;
1786   PetscInt          nz,k;
1787 
1788   PetscFunctionBegin;
1789   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1790   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1791   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1792   t    = a->solve_work;
1793   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1794 
1795   for (k=mbs-1; k>=0; k--) {
1796     v     = aa + ai[k] + 1;
1797     vj    = aj + ai[k] + 1;
1798     diagk = PetscRealPart(aa[ai[k]]);
1799     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1800     t[k] = b[k] * PetscSqrtReal(diagk);
1801     nz   = ai[k+1] - ai[k] - 1;
1802     while (nz--) t[k] += (*v++) * t[*vj++];
1803     x[rp[k]] = t[k];
1804   }
1805   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1806   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1807   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1808   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1809   PetscFunctionReturn(0);
1810 }
1811 
1812 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1813 {
1814   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1815   PetscErrorCode ierr;
1816 
1817   PetscFunctionBegin;
1818   if (A->rmap->bs == 1) {
1819     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1820   } else {
1821     IS                isrow=a->row;
1822     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1823     const MatScalar   *aa=a->a,*v;
1824     PetscScalar       *x,*t;
1825     const PetscScalar *b;
1826     PetscInt          nz,k,n,i,j;
1827 
1828     if (bb->n > a->solves_work_n) {
1829       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1830       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1831       a->solves_work_n = bb->n;
1832     }
1833     n    = bb->n;
1834     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1835     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1836     t    = a->solves_work;
1837 
1838     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1839 
1840     /* solve U^T*D*y = perm(b) by forward substitution */
1841     for (k=0; k<mbs; k++) {
1842       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs]; /* values are stored interlaced in t */
1843     }
1844     for (k=0; k<mbs; k++) {
1845       v  = aa + ai[k];
1846       vj = aj + ai[k];
1847       nz = ai[k+1] - ai[k] - 1;
1848       for (j=0; j<nz; j++) {
1849         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1850         v++;vj++;
1851       }
1852       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1853     }
1854 
1855     /* solve U*perm(x) = y by back substitution */
1856     for (k=mbs-1; k>=0; k--) {
1857       v  = aa + ai[k] - 1;
1858       vj = aj + ai[k] - 1;
1859       nz = ai[k+1] - ai[k] - 1;
1860       for (j=0; j<nz; j++) {
1861         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1862         v++;vj++;
1863       }
1864       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1865     }
1866 
1867     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1868     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1869     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1870     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1871   }
1872   PetscFunctionReturn(0);
1873 }
1874 
1875 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1876 {
1877   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data;
1878   PetscErrorCode ierr;
1879 
1880   PetscFunctionBegin;
1881   if (A->rmap->bs == 1) {
1882     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1883   } else {
1884     IS                isrow=a->row;
1885     const PetscInt    *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1886     const MatScalar   *aa=a->a,*v;
1887     PetscScalar       *x,*t;
1888     const PetscScalar *b;
1889     PetscInt          nz,k,n,i;
1890 
1891     if (bb->n > a->solves_work_n) {
1892       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1893       ierr = PetscMalloc1(bb->n*A->rmap->N,&a->solves_work);CHKERRQ(ierr);
1894       a->solves_work_n = bb->n;
1895     }
1896     n    = bb->n;
1897     ierr = VecGetArrayRead(bb->v,&b);CHKERRQ(ierr);
1898     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1899     t    = a->solves_work;
1900 
1901     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1902 
1903     /* solve U^T*D*y = perm(b) by forward substitution */
1904     for (k=0; k<mbs; k++) {
1905       for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];  /* values are stored interlaced in t */
1906     }
1907     for (k=0; k<mbs; k++) {
1908       v  = aa + ai[k];
1909       vj = aj + ai[k];
1910       nz = ai[k+1] - ai[k];
1911       while (nz--) {
1912         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1913         v++;vj++;
1914       }
1915       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1916     }
1917 
1918     /* solve U*perm(x) = y by back substitution */
1919     for (k=mbs-1; k>=0; k--) {
1920       v  = aa + ai[k];
1921       vj = aj + ai[k];
1922       nz = ai[k+1] - ai[k];
1923       while (nz--) {
1924         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1925         v++;vj++;
1926       }
1927       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1928     }
1929 
1930     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1931     ierr = VecRestoreArrayRead(bb->v,&b);CHKERRQ(ierr);
1932     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1933     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1934   }
1935   PetscFunctionReturn(0);
1936 }
1937 
1938 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1939 {
1940   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1941   PetscErrorCode    ierr;
1942   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1943   const MatScalar   *aa=a->a,*v;
1944   const PetscScalar *b;
1945   PetscScalar       *x,xi;
1946   PetscInt          nz,i,j;
1947 
1948   PetscFunctionBegin;
1949   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1950   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1951   /* solve U^T*D*y = b by forward substitution */
1952   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
1953   for (i=0; i<mbs; i++) {
1954     v  = aa + ai[i];
1955     vj = aj + ai[i];
1956     xi = x[i];
1957     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1958     for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
1959     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1960   }
1961   /* solve U*x = y by backward substitution */
1962   for (i=mbs-2; i>=0; i--) {
1963     xi = x[i];
1964     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1965     vj = aj + adiag[i] - 1;
1966     nz = ai[i+1] - ai[i] - 1;
1967     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1968     x[i] = xi;
1969   }
1970   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1971   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1972   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 PetscErrorCode MatMatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Mat B,Mat X)
1977 {
1978   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
1979   PetscErrorCode    ierr;
1980   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag=a->diag;
1981   const MatScalar   *aa=a->a,*v;
1982   const PetscScalar *b;
1983   PetscScalar       *x,xi;
1984   PetscInt          nz,i,j,neq,ldb,ldx;
1985   PetscBool         isdense;
1986 
1987   PetscFunctionBegin;
1988   if (!mbs) PetscFunctionReturn(0);
1989   ierr = PetscObjectTypeCompare((PetscObject)B,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1990   if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"B matrix must be a SeqDense matrix");
1991   if (X != B) {
1992     ierr = PetscObjectTypeCompare((PetscObject)X,MATSEQDENSE,&isdense);CHKERRQ(ierr);
1993     if (!isdense) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_INCOMP,"X matrix must be a SeqDense matrix");
1994   }
1995   ierr = MatDenseGetArrayRead(B,&b);CHKERRQ(ierr);
1996   ierr = MatDenseGetLDA(B,&ldb);CHKERRQ(ierr);
1997   ierr = MatDenseGetArray(X,&x);CHKERRQ(ierr);
1998   ierr = MatDenseGetLDA(X,&ldx);CHKERRQ(ierr);
1999   for (neq=0; neq<B->cmap->n; neq++) {
2000     /* solve U^T*D*y = b by forward substitution */
2001     ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2002     for (i=0; i<mbs; i++) {
2003       v  = aa + ai[i];
2004       vj = aj + ai[i];
2005       xi = x[i];
2006       nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
2007       for (j=0; j<nz; j++) x[vj[j]] += v[j]*xi;
2008       x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2009     }
2010     /* solve U*x = y by backward substitution */
2011     for (i=mbs-2; i>=0; i--) {
2012       xi = x[i];
2013       v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2014       vj = aj + adiag[i] - 1;
2015       nz = ai[i+1] - ai[i] - 1;
2016       for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2017       x[i] = xi;
2018     }
2019     b += ldb;
2020     x += ldx;
2021   }
2022   ierr = MatDenseRestoreArrayRead(B,&b);CHKERRQ(ierr);
2023   ierr = MatDenseRestoreArray(X,&x);CHKERRQ(ierr);
2024   ierr = PetscLogFlops(B->cmap->n*(4.0*a->nz - 3*mbs));CHKERRQ(ierr);
2025   PetscFunctionReturn(0);
2026 }
2027 
2028 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2029 {
2030   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2031   PetscErrorCode    ierr;
2032   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2033   const MatScalar   *aa=a->a,*v;
2034   PetscScalar       *x,xk;
2035   const PetscScalar *b;
2036   PetscInt          nz,k;
2037 
2038   PetscFunctionBegin;
2039   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2040   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2041 
2042   /* solve U^T*D*y = b by forward substitution */
2043   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2044   for (k=0; k<mbs; k++) {
2045     v  = aa + ai[k] + 1;
2046     vj = aj + ai[k] + 1;
2047     xk = x[k];
2048     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2049     while (nz--) x[*vj++] += (*v++) * xk;
2050     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2051   }
2052 
2053   /* solve U*x = y by back substitution */
2054   for (k=mbs-2; k>=0; k--) {
2055     v  = aa + ai[k] + 1;
2056     vj = aj + ai[k] + 1;
2057     xk = x[k];
2058     nz = ai[k+1] - ai[k] - 1;
2059     while (nz--) {
2060       xk += (*v++) * x[*vj++];
2061     }
2062     x[k] = xk;
2063   }
2064 
2065   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2066   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2067   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2068   PetscFunctionReturn(0);
2069 }
2070 
2071 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2072 {
2073   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2074   PetscErrorCode    ierr;
2075   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2076   const MatScalar   *aa=a->a,*v;
2077   PetscReal         diagk;
2078   PetscScalar       *x;
2079   const PetscScalar *b;
2080   PetscInt          nz,k;
2081 
2082   PetscFunctionBegin;
2083   /* solve U^T*D^(1/2)*x = b by forward substitution */
2084   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2085   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2086   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2087   for (k=0; k<mbs; k++) {
2088     v  = aa + ai[k];
2089     vj = aj + ai[k];
2090     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2091     while (nz--) x[*vj++] += (*v++) * x[k];
2092     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2093     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2094     x[k] *= PetscSqrtReal(diagk);
2095   }
2096   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2098   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2099   PetscFunctionReturn(0);
2100 }
2101 
2102 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2103 {
2104   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2105   PetscErrorCode    ierr;
2106   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2107   const MatScalar   *aa=a->a,*v;
2108   PetscReal         diagk;
2109   PetscScalar       *x;
2110   const PetscScalar *b;
2111   PetscInt          nz,k;
2112 
2113   PetscFunctionBegin;
2114   /* solve U^T*D^(1/2)*x = b by forward substitution */
2115   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2116   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2117   ierr = PetscArraycpy(x,b,mbs);CHKERRQ(ierr);
2118   for (k=0; k<mbs; k++) {
2119     v  = aa + ai[k] + 1;
2120     vj = aj + ai[k] + 1;
2121     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2122     while (nz--) x[*vj++] += (*v++) * x[k];
2123     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2124     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2125     x[k] *= PetscSqrtReal(diagk);
2126   }
2127   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2128   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2129   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2130   PetscFunctionReturn(0);
2131 }
2132 
2133 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2134 {
2135   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2136   PetscErrorCode    ierr;
2137   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vj;
2138   const MatScalar   *aa=a->a,*v;
2139   PetscReal         diagk;
2140   PetscScalar       *x;
2141   const PetscScalar *b;
2142   PetscInt          nz,k;
2143 
2144   PetscFunctionBegin;
2145   /* solve D^(1/2)*U*x = b by back substitution */
2146   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2147   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2148 
2149   for (k=mbs-1; k>=0; k--) {
2150     v     = aa + ai[k];
2151     vj    = aj + ai[k];
2152     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2153     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2154     x[k] = PetscSqrtReal(diagk)*b[k];
2155     nz   = ai[k+1] - ai[k] - 1;
2156     while (nz--) x[k] += (*v++) * x[*vj++];
2157   }
2158   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2159   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2160   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2161   PetscFunctionReturn(0);
2162 }
2163 
2164 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2165 {
2166   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ*)A->data;
2167   PetscErrorCode    ierr;
2168   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj;
2169   const MatScalar   *aa=a->a,*v;
2170   PetscReal         diagk;
2171   PetscScalar       *x;
2172   const PetscScalar *b;
2173   PetscInt          nz,k;
2174 
2175   PetscFunctionBegin;
2176   /* solve D^(1/2)*U*x = b by back substitution */
2177   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2178   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2179 
2180   for (k=mbs-1; k>=0; k--) {
2181     v     = aa + ai[k] + 1;
2182     vj    = aj + ai[k] + 1;
2183     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2184     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2185     x[k] = PetscSqrtReal(diagk)*b[k];
2186     nz   = ai[k+1] - ai[k] - 1;
2187     while (nz--) x[k] += (*v++) * x[*vj++];
2188   }
2189   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2191   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2192   PetscFunctionReturn(0);
2193 }
2194 
2195 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2196 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2197 {
2198   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2199   PetscErrorCode ierr;
2200   const PetscInt *rip,mbs = a->mbs,*ai,*aj;
2201   PetscInt       *jutmp,bs = A->rmap->bs,i;
2202   PetscInt       m,reallocs = 0,*levtmp;
2203   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2204   PetscInt       incrlev,*lev,shift,prow,nz;
2205   PetscReal      f = info->fill,levels = info->levels;
2206   PetscBool      perm_identity;
2207 
2208   PetscFunctionBegin;
2209   /* check whether perm is the identity mapping */
2210   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2211 
2212   if (perm_identity) {
2213     a->permute = PETSC_FALSE;
2214     ai         = a->i; aj = a->j;
2215   } else { /*  non-trivial permutation */
2216     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2217   }
2218 
2219   /* initialization */
2220   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2221   umax  = (PetscInt)(f*ai[mbs] + 1);
2222   ierr  = PetscMalloc1(umax,&lev);CHKERRQ(ierr);
2223   umax += mbs + 1;
2224   shift = mbs + 1;
2225   ierr  = PetscMalloc1(mbs+1,&iu);CHKERRQ(ierr);
2226   ierr  = PetscMalloc1(umax,&ju);CHKERRQ(ierr);
2227   iu[0] = mbs + 1;
2228   juidx = mbs + 1;
2229   /* prowl: linked list for pivot row */
2230   ierr = PetscMalloc3(mbs,&prowl,mbs,&q,mbs,&levtmp);CHKERRQ(ierr);
2231   /* q: linked list for col index */
2232 
2233   for (i=0; i<mbs; i++) {
2234     prowl[i] = mbs;
2235     q[i]     = 0;
2236   }
2237 
2238   /* for each row k */
2239   for (k=0; k<mbs; k++) {
2240     nzk  = 0;
2241     q[k] = mbs;
2242     /* copy current row into linked list */
2243     nz = ai[rip[k]+1] - ai[rip[k]];
2244     j  = ai[rip[k]];
2245     while (nz--) {
2246       vj = rip[aj[j++]];
2247       if (vj > k) {
2248         qm = k;
2249         do {
2250           m = qm; qm = q[m];
2251         } while (qm < vj);
2252         if (qm == vj) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2253         nzk++;
2254         q[m]       = vj;
2255         q[vj]      = qm;
2256         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2257       }
2258     }
2259 
2260     /* modify nonzero structure of k-th row by computing fill-in
2261        for each row prow to be merged in */
2262     prow = k;
2263     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2264 
2265     while (prow < k) {
2266       /* merge row prow into k-th row */
2267       jmin = iu[prow] + 1;
2268       jmax = iu[prow+1];
2269       qm   = k;
2270       for (j=jmin; j<jmax; j++) {
2271         incrlev = lev[j-shift] + 1;
2272         if (incrlev > levels) continue;
2273         vj = ju[j];
2274         do {
2275           m = qm; qm = q[m];
2276         } while (qm < vj);
2277         if (qm != vj) {      /* a fill */
2278           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2279           levtmp[vj] = incrlev;
2280         } else if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2281       }
2282       prow = prowl[prow]; /* next pivot row */
2283     }
2284 
2285     /* add k to row list for first nonzero element in k-th row */
2286     if (nzk > 1) {
2287       i        = q[k]; /* col value of first nonzero element in k_th row of U */
2288       prowl[k] = prowl[i]; prowl[i] = k;
2289     }
2290     iu[k+1] = iu[k] + nzk;
2291 
2292     /* allocate more space to ju and lev if needed */
2293     if (iu[k+1] > umax) {
2294       /* estimate how much additional space we will need */
2295       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2296       /* just double the memory each time */
2297       maxadd = umax;
2298       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2299       umax += maxadd;
2300 
2301       /* allocate a longer ju */
2302       ierr = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2303       ierr = PetscArraycpy(jutmp,ju,iu[k]);CHKERRQ(ierr);
2304       ierr = PetscFree(ju);CHKERRQ(ierr);
2305       ju   = jutmp;
2306 
2307       ierr      = PetscMalloc1(umax,&jutmp);CHKERRQ(ierr);
2308       ierr      = PetscArraycpy(jutmp,lev,iu[k]-shift);CHKERRQ(ierr);
2309       ierr      = PetscFree(lev);CHKERRQ(ierr);
2310       lev       = jutmp;
2311       reallocs += 2; /* count how many times we realloc */
2312     }
2313 
2314     /* save nonzero structure of k-th row in ju */
2315     i=k;
2316     while (nzk--) {
2317       i                = q[i];
2318       ju[juidx]        = i;
2319       lev[juidx-shift] = levtmp[i];
2320       juidx++;
2321     }
2322   }
2323 
2324 #if defined(PETSC_USE_INFO)
2325   if (ai[mbs] != 0) {
2326     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2327     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)f,(double)af);CHKERRQ(ierr);
2328     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2329     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g);\n",(double)af);CHKERRQ(ierr);
2330     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2331   } else {
2332     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2333   }
2334 #endif
2335 
2336   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2337   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2338   ierr = PetscFree(lev);CHKERRQ(ierr);
2339 
2340   /* put together the new matrix */
2341   ierr = MatSeqSBAIJSetPreallocation(B,bs,0,NULL);CHKERRQ(ierr);
2342 
2343   /* ierr = PetscLogObjectParent((PetscObject)B,(PetscObject)iperm);CHKERRQ(ierr); */
2344   b    = (Mat_SeqSBAIJ*)(B)->data;
2345   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2346 
2347   b->singlemalloc = PETSC_FALSE;
2348   b->free_a       = PETSC_TRUE;
2349   b->free_ij      = PETSC_TRUE;
2350   /* the next line frees the default space generated by the Create() */
2351   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2352   ierr    = PetscMalloc1((iu[mbs]+1)*a->bs2,&b->a);CHKERRQ(ierr);
2353   b->j    = ju;
2354   b->i    = iu;
2355   b->diag = NULL;
2356   b->ilen = NULL;
2357   b->imax = NULL;
2358 
2359   ierr    = ISDestroy(&b->row);CHKERRQ(ierr);
2360   ierr    = ISDestroy(&b->icol);CHKERRQ(ierr);
2361   b->row  = perm;
2362   b->icol = perm;
2363   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2364   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2365   ierr    = PetscMalloc1(bs*mbs+bs,&b->solve_work);CHKERRQ(ierr);
2366   /* In b structure:  Free imax, ilen, old a, old j.
2367      Allocate idnew, solve_work, new a, new j */
2368   ierr     = PetscLogObjectMemory((PetscObject)B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2369   b->maxnz = b->nz = iu[mbs];
2370 
2371   (B)->info.factor_mallocs   = reallocs;
2372   (B)->info.fill_ratio_given = f;
2373   if (ai[mbs] != 0) {
2374     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2375   } else {
2376     (B)->info.fill_ratio_needed = 0.0;
2377   }
2378   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2379   PetscFunctionReturn(0);
2380 }
2381 
2382 /*
2383   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2384 */
2385 #include <petscbt.h>
2386 #include <../src/mat/utils/freespace.h>
2387 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2388 {
2389   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2390   PetscErrorCode     ierr;
2391   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2392   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2393   const PetscInt     *rip;
2394   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2395   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2396   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2397   PetscReal          fill          =info->fill,levels=info->levels;
2398   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2399   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2400   PetscBT            lnkbt;
2401 
2402   PetscFunctionBegin;
2403   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2404   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2405   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2406   if (bs > 1) {
2407     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2408     PetscFunctionReturn(0);
2409   }
2410 
2411   /* check whether perm is the identity mapping */
2412   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2413   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2414   a->permute = PETSC_FALSE;
2415 
2416   ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2417   ierr  = PetscMalloc1(am+1,&udiag);CHKERRQ(ierr);
2418   ui[0] = 0;
2419 
2420   /* ICC(0) without matrix ordering: simply rearrange column indices */
2421   if (!levels) {
2422     /* reuse the column pointers and row offsets for memory savings */
2423     for (i=0; i<am; i++) {
2424       ncols    = ai[i+1] - ai[i];
2425       ui[i+1]  = ui[i] + ncols;
2426       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2427     }
2428     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2429     cols = uj;
2430     for (i=0; i<am; i++) {
2431       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2432       ncols = ai[i+1] - ai[i] -1;
2433       for (j=0; j<ncols; j++) *cols++ = aj[j];
2434       *cols++ = i; /* diagonal is located as the last entry of U(i,:) */
2435     }
2436   } else { /* case: levels>0 */
2437     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2438 
2439     /* initialization */
2440     /* jl: linked list for storing indices of the pivot rows
2441        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2442     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2443     for (i=0; i<am; i++) {
2444       jl[i] = am; il[i] = 0;
2445     }
2446 
2447     /* create and initialize a linked list for storing column indices of the active row k */
2448     nlnk = am + 1;
2449     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2450 
2451     /* initial FreeSpace size is fill*(ai[am]+1) */
2452     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2453 
2454     current_space = free_space;
2455 
2456     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2457 
2458     current_space_lvl = free_space_lvl;
2459 
2460     for (k=0; k<am; k++) {  /* for each active row k */
2461       /* initialize lnk by the column indices of row k */
2462       nzk   = 0;
2463       ncols = ai[k+1] - ai[k];
2464       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2465       cols = aj+ai[k];
2466       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2467       nzk += nlnk;
2468 
2469       /* update lnk by computing fill-in for each pivot row to be merged in */
2470       prow = jl[k]; /* 1st pivot row */
2471 
2472       while (prow < k) {
2473         nextprow = jl[prow];
2474 
2475         /* merge prow into k-th row */
2476         jmin  = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2477         jmax  = ui[prow+1];
2478         ncols = jmax-jmin;
2479         i     = jmin - ui[prow];
2480 
2481         cols = uj_ptr[prow] + i;  /* points to the 2nd nzero entry in U(prow,k:am-1) */
2482         uj   = uj_lvl_ptr[prow] + i;  /* levels of cols */
2483         j    = *(uj - 1);
2484         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2485         nzk += nlnk;
2486 
2487         /* update il and jl for prow */
2488         if (jmin < jmax) {
2489           il[prow] = jmin;
2490 
2491           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2492         }
2493         prow = nextprow;
2494       }
2495 
2496       /* if free space is not available, make more free space */
2497       if (current_space->local_remaining<nzk) {
2498         i    = am - k + 1; /* num of unfactored rows */
2499         i    = PetscIntMultTruncate(i,PetscMin(nzk, i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2500         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2501         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2502         reallocs++;
2503       }
2504 
2505       /* copy data into free_space and free_space_lvl, then initialize lnk */
2506       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2507       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2508 
2509       /* add the k-th row into il and jl */
2510       if (nzk > 1) {
2511         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2512         jl[k] = jl[i]; jl[i] = k;
2513         il[k] = ui[k] + 1;
2514       }
2515       uj_ptr[k]     = current_space->array;
2516       uj_lvl_ptr[k] = current_space_lvl->array;
2517 
2518       current_space->array               += nzk;
2519       current_space->local_used          += nzk;
2520       current_space->local_remaining     -= nzk;
2521       current_space_lvl->array           += nzk;
2522       current_space_lvl->local_used      += nzk;
2523       current_space_lvl->local_remaining -= nzk;
2524 
2525       ui[k+1] = ui[k] + nzk;
2526     }
2527 
2528     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2529     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2530 
2531     /* destroy list of free space and other temporary array(s) */
2532     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2533     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2534     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2535     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2536 
2537   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2538 
2539   /* put together the new matrix in MATSEQSBAIJ format */
2540   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2541 
2542   b    = (Mat_SeqSBAIJ*)(fact)->data;
2543   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2544 
2545   b->singlemalloc = PETSC_FALSE;
2546   b->free_a       = PETSC_TRUE;
2547   b->free_ij      = free_ij;
2548 
2549   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2550 
2551   b->j         = uj;
2552   b->i         = ui;
2553   b->diag      = udiag;
2554   b->free_diag = PETSC_TRUE;
2555   b->ilen      = NULL;
2556   b->imax      = NULL;
2557   b->row       = perm;
2558   b->col       = perm;
2559 
2560   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2561   ierr = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2562 
2563   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2564 
2565   ierr = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2566   ierr = PetscLogObjectMemory((PetscObject)fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2567 
2568   b->maxnz = b->nz = ui[am];
2569 
2570   fact->info.factor_mallocs   = reallocs;
2571   fact->info.fill_ratio_given = fill;
2572   if (ai[am] != 0) {
2573     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2574   } else {
2575     fact->info.fill_ratio_needed = 0.0;
2576   }
2577 #if defined(PETSC_USE_INFO)
2578   if (ai[am] != 0) {
2579     PetscReal af = fact->info.fill_ratio_needed;
2580     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2581     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2582     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2583   } else {
2584     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2585   }
2586 #endif
2587   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2588   PetscFunctionReturn(0);
2589 }
2590 
2591 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2592 {
2593   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2594   Mat_SeqSBAIJ       *b;
2595   PetscErrorCode     ierr;
2596   PetscBool          perm_identity,free_ij = PETSC_TRUE;
2597   PetscInt           bs=A->rmap->bs,am=a->mbs;
2598   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2599   PetscInt           reallocs=0,i,*ui;
2600   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2601   PetscInt           nlnk,*lnk,*lnk_lvl=NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2602   PetscReal          fill          =info->fill,levels=info->levels,ratio_needed;
2603   PetscFreeSpaceList free_space    =NULL,current_space=NULL;
2604   PetscFreeSpaceList free_space_lvl=NULL,current_space_lvl=NULL;
2605   PetscBT            lnkbt;
2606 
2607   PetscFunctionBegin;
2608   /*
2609    This code originally uses Modified Sparse Row (MSR) storage
2610    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2611    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2612    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2613    thus the original code in MSR format is still used for these cases.
2614    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2615    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2616   */
2617   if (bs > 1) {
2618     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2619     PetscFunctionReturn(0);
2620   }
2621 
2622   /* check whether perm is the identity mapping */
2623   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2624   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2625   a->permute = PETSC_FALSE;
2626 
2627   /* special case that simply copies fill pattern */
2628   if (!levels) {
2629     /* reuse the column pointers and row offsets for memory savings */
2630     ui           = a->i;
2631     uj           = a->j;
2632     free_ij      = PETSC_FALSE;
2633     ratio_needed = 1.0;
2634   } else { /* case: levels>0 */
2635     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2636 
2637     /* initialization */
2638     ierr  = PetscMalloc1(am+1,&ui);CHKERRQ(ierr);
2639     ui[0] = 0;
2640 
2641     /* jl: linked list for storing indices of the pivot rows
2642        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2643     ierr = PetscMalloc4(am,&uj_ptr,am,&uj_lvl_ptr,am,&il,am,&jl);CHKERRQ(ierr);
2644     for (i=0; i<am; i++) {
2645       jl[i] = am; il[i] = 0;
2646     }
2647 
2648     /* create and initialize a linked list for storing column indices of the active row k */
2649     nlnk = am + 1;
2650     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2651 
2652     /* initial FreeSpace size is fill*(ai[am]+1) */
2653     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space);CHKERRQ(ierr);
2654 
2655     current_space = free_space;
2656 
2657     ierr = PetscFreeSpaceGet(PetscRealIntMultTruncate(fill,ai[am]+1),&free_space_lvl);CHKERRQ(ierr);
2658 
2659     current_space_lvl = free_space_lvl;
2660 
2661     for (k=0; k<am; k++) {  /* for each active row k */
2662       /* initialize lnk by the column indices of row rip[k] */
2663       nzk   = 0;
2664       ncols = ai[rip[k]+1] - ai[rip[k]];
2665       cols  = aj+ai[rip[k]];
2666       ierr  = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2667       nzk  += nlnk;
2668 
2669       /* update lnk by computing fill-in for each pivot row to be merged in */
2670       prow = jl[k]; /* 1st pivot row */
2671 
2672       while (prow < k) {
2673         nextprow = jl[prow];
2674 
2675         /* merge prow into k-th row */
2676         jmin     = il[prow] + 1; /* index of the 2nd nzero entry in U(prow,k:am-1) */
2677         jmax     = ui[prow+1];
2678         ncols    = jmax-jmin;
2679         i        = jmin - ui[prow];
2680         cols     = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2681         j        = *(uj_lvl_ptr[prow] + i - 1);
2682         cols_lvl = uj_lvl_ptr[prow]+i;
2683         ierr     = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2684         nzk     += nlnk;
2685 
2686         /* update il and jl for prow */
2687         if (jmin < jmax) {
2688           il[prow] = jmin;
2689           j        = *cols;
2690           jl[prow] = jl[j];
2691           jl[j]    = prow;
2692         }
2693         prow = nextprow;
2694       }
2695 
2696       /* if free space is not available, make more free space */
2697       if (current_space->local_remaining<nzk) {
2698         i    = am - k + 1; /* num of unfactored rows */
2699         i    = PetscMin(PetscIntMultTruncate(i,nzk), PetscIntMultTruncate(i,i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2700         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2701         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2702         reallocs++;
2703       }
2704 
2705       /* copy data into free_space and free_space_lvl, then initialize lnk */
2706       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2707 
2708       /* add the k-th row into il and jl */
2709       if (nzk-1 > 0) {
2710         i     = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2711         jl[k] = jl[i]; jl[i] = k;
2712         il[k] = ui[k] + 1;
2713       }
2714       uj_ptr[k]     = current_space->array;
2715       uj_lvl_ptr[k] = current_space_lvl->array;
2716 
2717       current_space->array               += nzk;
2718       current_space->local_used          += nzk;
2719       current_space->local_remaining     -= nzk;
2720       current_space_lvl->array           += nzk;
2721       current_space_lvl->local_used      += nzk;
2722       current_space_lvl->local_remaining -= nzk;
2723 
2724       ui[k+1] = ui[k] + nzk;
2725     }
2726 
2727     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2728     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2729 
2730     /* destroy list of free space and other temporary array(s) */
2731     ierr = PetscMalloc1(ui[am]+1,&uj);CHKERRQ(ierr);
2732     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2733     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2734     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2735     if (ai[am] != 0) {
2736       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2737     } else {
2738       ratio_needed = 0.0;
2739     }
2740   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2741 
2742   /* put together the new matrix in MATSEQSBAIJ format */
2743   ierr = MatSeqSBAIJSetPreallocation(fact,bs,MAT_SKIP_ALLOCATION,NULL);CHKERRQ(ierr);
2744 
2745   b = (Mat_SeqSBAIJ*)(fact)->data;
2746 
2747   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2748 
2749   b->singlemalloc = PETSC_FALSE;
2750   b->free_a       = PETSC_TRUE;
2751   b->free_ij      = free_ij;
2752 
2753   ierr = PetscMalloc1(ui[am]+1,&b->a);CHKERRQ(ierr);
2754 
2755   b->j             = uj;
2756   b->i             = ui;
2757   b->diag          = NULL;
2758   b->ilen          = NULL;
2759   b->imax          = NULL;
2760   b->row           = perm;
2761   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2762 
2763   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2764   b->icol = perm;
2765   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2766   ierr    = PetscMalloc1(am+1,&b->solve_work);CHKERRQ(ierr);
2767 
2768   b->maxnz = b->nz = ui[am];
2769 
2770   fact->info.factor_mallocs    = reallocs;
2771   fact->info.fill_ratio_given  = fill;
2772   fact->info.fill_ratio_needed = ratio_needed;
2773 #if defined(PETSC_USE_INFO)
2774   if (ai[am] != 0) {
2775     PetscReal af = fact->info.fill_ratio_needed;
2776     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %g needed %g\n",reallocs,(double)fill,(double)af);CHKERRQ(ierr);
2777     ierr = PetscInfo1(A,"Run with -pc_factor_fill %g or use \n",(double)af);CHKERRQ(ierr);
2778     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%g) for best performance.\n",(double)af);CHKERRQ(ierr);
2779   } else {
2780     ierr = PetscInfo(A,"Empty matrix\n");CHKERRQ(ierr);
2781   }
2782 #endif
2783   if (perm_identity) {
2784     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2785   } else {
2786     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2787   }
2788   PetscFunctionReturn(0);
2789 }
2790