xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision c07bf0741b2e8ac3bb29c3d024f5883d65444abc)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for SBAIJ format.
5 */
6 
7 #include "../src/mat/impls/sbaij/seq/sbaij.h"
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
13 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
16   IS              isrow=a->row;
17   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
18   PetscErrorCode  ierr;
19   const PetscInt  *r;
20   PetscInt        nz,*vj,k,idx,k1;
21   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
22   MatScalar       *aa=a->a,*v,*diag;
23   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;
24 
25   PetscFunctionBegin;
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28   t  = a->solve_work;
29   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
31 
32   /* solve U^T * D * y = b by forward substitution */
33   xk = t;
34   for (k=0; k<mbs; k++) { /* t <- perm(b) */
35     idx   = bs*r[k];
36     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37   }
38   for (k=0; k<mbs; k++){
39     v  = aa + bs2*ai[k];
40     xk = t + k*bs;      /* Dk*xk = k-th block of x */
41     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
42     nz = ai[k+1] - ai[k];
43     vj = aj + ai[k];
44     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
45     while (nz--) {
46       /* x(:) += U(k,:)^T*(Dk*xk) */
47       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48       vj++; xj = t + (*vj)*bs;
49       v += bs2;
50     }
51     /* xk = inv(Dk)*(Dk*xk) */
52     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
53     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54   }
55 
56   /* solve U*x = y by back substitution */
57   for (k=mbs-1; k>=0; k--){
58     v  = aa + bs2*ai[k];
59     xk = t + k*bs;        /* xk */
60     nz = ai[k+1] - ai[k];
61     vj = aj + ai[k];
62     xj = t + (*vj)*bs;
63     while (nz--) {
64       /* xk += U(k,:)*x(:) */
65       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66       vj++;
67       v += bs2; xj = t + (*vj)*bs;
68     }
69     idx   = bs*r[k];
70     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71   }
72 
73   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
74   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
75   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
76   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
77   ierr = PetscLogFlops(bs2*(2.0*a->nz + mbs));CHKERRQ(ierr);
78   PetscFunctionReturn(0);
79 }
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
83 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84 {
85   PetscFunctionBegin;
86   SETERRQ(1,"not implemented yet");
87   PetscFunctionReturn(0);
88 }
89 
90 #undef __FUNCT__
91 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
92 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93 {
94   PetscFunctionBegin;
95   SETERRQ(1,"not implemented yet");
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
101 PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102 {
103   PetscErrorCode ierr;
104   PetscInt       nz,*vj,k;
105   PetscInt       bs2 = bs*bs;
106   MatScalar      *v,*diag;
107   PetscScalar    *xk,*xj,*xk_tmp;
108 
109   PetscFunctionBegin;
110   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
111   for (k=0; k<mbs; k++){
112     v  = aa + bs2*ai[k];
113     xk = x + k*bs;      /* Dk*xk = k-th block of x */
114     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
115     nz = ai[k+1] - ai[k];
116     vj = aj + ai[k];
117     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
118     while (nz--) {
119       /* x(:) += U(k,:)^T*(Dk*xk) */
120       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121       vj++; xj = x + (*vj)*bs;
122       v += bs2;
123     }
124     /* xk = inv(Dk)*(Dk*xk) */
125     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127   }
128   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
129   PetscFunctionReturn(0);
130 }
131 
132 #undef __FUNCT__
133 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
134 PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135 {
136   PetscInt       nz,*vj,k;
137   PetscInt       bs2 = bs*bs;
138   MatScalar      *v;
139   PetscScalar    *xk,*xj;
140 
141   PetscFunctionBegin;
142   for (k=mbs-1; k>=0; k--){
143     v  = aa + bs2*ai[k];
144     xk = x + k*bs;        /* xk */
145     nz = ai[k+1] - ai[k];
146     vj = aj + ai[k];
147     xj = x + (*vj)*bs;
148     while (nz--) {
149       /* xk += U(k,:)*x(:) */
150       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151       vj++;
152       v += bs2; xj = x + (*vj)*bs;
153     }
154   }
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNCT__
159 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
160 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161 {
162   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
163   PetscErrorCode ierr;
164   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
165   PetscInt       bs=A->rmap->bs;
166   MatScalar      *aa=a->a;
167   PetscScalar    *x,*b;
168 #if defined(PETSC_USE_LOG)
169   PetscInt       bs2 = a->bs2;
170 #endif
171 
172   PetscFunctionBegin;
173   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
174   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
175 
176   /* solve U^T * D * y = b by forward substitution */
177   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
178   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
179 
180   /* solve U*x = y by back substitution */
181   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
182 
183   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
184   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
185   ierr = PetscLogFlops(bs2*(2.0*a->nz + mbs));CHKERRQ(ierr);
186   PetscFunctionReturn(0);
187 }
188 
189 #undef __FUNCT__
190 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
191 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
192 {
193   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
194   PetscErrorCode ierr;
195   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
196   PetscInt       bs=A->rmap->bs;
197   MatScalar      *aa=a->a;
198   PetscScalar    *x,*b;
199 #if defined(PETSC_USE_LOG)
200   PetscInt       bs2 = a->bs2;
201 #endif
202 
203   PetscFunctionBegin;
204   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
205   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
206   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
207   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
208   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
209   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
210   ierr = PetscLogFlops(bs2*a->nz + A->rmap->N);CHKERRQ(ierr);
211   PetscFunctionReturn(0);
212 }
213 
214 #undef __FUNCT__
215 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
216 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
217 {
218   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
219   PetscErrorCode ierr;
220   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
221   PetscInt       bs=A->rmap->bs;
222   MatScalar      *aa=a->a;
223   PetscScalar    *x,*b;
224 #if defined(PETSC_USE_LOG)
225   PetscInt       bs2 = a->bs2;
226 #endif
227 
228   PetscFunctionBegin;
229   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
230   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
231   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
232   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
233   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
234   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
235   ierr = PetscLogFlops(bs2*a->nz);CHKERRQ(ierr);
236   PetscFunctionReturn(0);
237 }
238 
239 #undef __FUNCT__
240 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
241 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
242 {
243   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
244   IS             isrow=a->row;
245   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
246   PetscErrorCode ierr;
247   const PetscInt *r;
248   PetscInt       nz,*vj,k,idx;
249   MatScalar      *aa=a->a,*v,*d;
250   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
251 
252   PetscFunctionBegin;
253   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255   t  = a->solve_work;
256   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
257 
258   /* solve U^T * D * y = b by forward substitution */
259   tp = t;
260   for (k=0; k<mbs; k++) { /* t <- perm(b) */
261     idx   = 7*r[k];
262     tp[0] = b[idx];
263     tp[1] = b[idx+1];
264     tp[2] = b[idx+2];
265     tp[3] = b[idx+3];
266     tp[4] = b[idx+4];
267     tp[5] = b[idx+5];
268     tp[6] = b[idx+6];
269     tp += 7;
270   }
271 
272   for (k=0; k<mbs; k++){
273     v  = aa + 49*ai[k];
274     vj = aj + ai[k];
275     tp = t + k*7;
276     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
277     nz = ai[k+1] - ai[k];
278     tp = t + (*vj)*7;
279     while (nz--) {
280       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
281       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
282       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
283       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
284       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
285       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
286       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
287       vj++; tp = t + (*vj)*7;
288       v += 49;
289     }
290 
291     /* xk = inv(Dk)*(Dk*xk) */
292     d  = aa+k*49;          /* ptr to inv(Dk) */
293     tp    = t + k*7;
294     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
295     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
296     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
297     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
298     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
299     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
300     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
301   }
302 
303   /* solve U*x = y by back substitution */
304   for (k=mbs-1; k>=0; k--){
305     v  = aa + 49*ai[k];
306     vj = aj + ai[k];
307     tp    = t + k*7;
308     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
309     nz = ai[k+1] - ai[k];
310 
311     tp = t + (*vj)*7;
312     while (nz--) {
313       /* xk += U(k,:)*x(:) */
314       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
315       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
316       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
317       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
318       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
319       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
320       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
321       vj++; tp = t + (*vj)*7;
322       v += 49;
323     }
324     tp    = t + k*7;
325     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
326     idx   = 7*r[k];
327     x[idx]     = x0;
328     x[idx+1]   = x1;
329     x[idx+2]   = x2;
330     x[idx+3]   = x3;
331     x[idx+4]   = x4;
332     x[idx+5]   = x5;
333     x[idx+6]   = x6;
334   }
335 
336   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
337   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
338   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
339   ierr = PetscLogFlops(49.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
340   PetscFunctionReturn(0);
341 }
342 
343 #undef __FUNCT__
344 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
345 PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
346 {
347   MatScalar      *v,*d;
348   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
349   PetscInt       nz,*vj,k;
350 
351   PetscFunctionBegin;
352   for (k=0; k<mbs; k++){
353     v  = aa + 49*ai[k];
354     xp = x + k*7;
355     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
356     nz = ai[k+1] - ai[k];
357     vj = aj + ai[k];
358     xp = x + (*vj)*7;
359     while (nz--) {
360       /* x(:) += U(k,:)^T*(Dk*xk) */
361       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
362       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
363       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
364       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
365       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
366       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
367       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
368       vj++; xp = x + (*vj)*7;
369       v += 49;
370     }
371     /* xk = inv(Dk)*(Dk*xk) */
372     d  = aa+k*49;          /* ptr to inv(Dk) */
373     xp = x + k*7;
374     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
375     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
376     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
377     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
378     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
379     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
380     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
381   }
382   PetscFunctionReturn(0);
383 }
384 
385 #undef __FUNCT__
386 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
387 PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
388 {
389   MatScalar      *v;
390   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
391   PetscInt       nz,*vj,k;
392 
393   PetscFunctionBegin;
394   for (k=mbs-1; k>=0; k--){
395     v  = aa + 49*ai[k];
396     xp = x + k*7;
397     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
398     nz = ai[k+1] - ai[k];
399     vj = aj + ai[k];
400     xp = x + (*vj)*7;
401     while (nz--) {
402       /* xk += U(k,:)*x(:) */
403       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
404       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
405       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
406       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
407       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
408       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
409       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
410       vj++;
411       v += 49; xp = x + (*vj)*7;
412     }
413     xp = x + k*7;
414     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
415   }
416   PetscFunctionReturn(0);
417 }
418 
419 #undef __FUNCT__
420 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
421 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
422 {
423   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
424   PetscErrorCode ierr;
425   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
426   MatScalar      *aa=a->a;
427   PetscScalar    *x,*b;
428 
429   PetscFunctionBegin;
430   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
431   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
432 
433   /* solve U^T * D * y = b by forward substitution */
434   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
435   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
436 
437   /* solve U*x = y by back substitution */
438   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
439 
440   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
441   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
442   ierr = PetscLogFlops(49.0*(2*a->nz + mbs));CHKERRQ(ierr);
443   PetscFunctionReturn(0);
444 }
445 
446 #undef __FUNCT__
447 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
448 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
449 {
450   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
451   PetscErrorCode ierr;
452   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
453   MatScalar      *aa=a->a;
454   PetscScalar    *x,*b;
455 
456   PetscFunctionBegin;
457   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
458   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
459   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
460   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
461   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
462   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
463   ierr = PetscLogFlops(49.0*a->nz + mbs);CHKERRQ(ierr);
464   PetscFunctionReturn(0);
465 }
466 
467 #undef __FUNCT__
468 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
469 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
470 {
471   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
472   PetscErrorCode ierr;
473   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
474   MatScalar      *aa=a->a;
475   PetscScalar    *x,*b;
476 
477   PetscFunctionBegin;
478   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
479   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
481   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
482   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
483   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
484   ierr = PetscLogFlops(49.0*a->nz);CHKERRQ(ierr);
485   PetscFunctionReturn(0);
486 }
487 
488 #undef __FUNCT__
489 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
490 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
491 {
492   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
493   IS             isrow=a->row;
494   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
495   PetscErrorCode ierr;
496   const PetscInt *r;
497   PetscInt       nz,*vj,k,idx;
498   MatScalar      *aa=a->a,*v,*d;
499   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
500 
501   PetscFunctionBegin;
502   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
503   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
504   t  = a->solve_work;
505   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
506 
507   /* solve U^T * D * y = b by forward substitution */
508   tp = t;
509   for (k=0; k<mbs; k++) { /* t <- perm(b) */
510     idx   = 6*r[k];
511     tp[0] = b[idx];
512     tp[1] = b[idx+1];
513     tp[2] = b[idx+2];
514     tp[3] = b[idx+3];
515     tp[4] = b[idx+4];
516     tp[5] = b[idx+5];
517     tp += 6;
518   }
519 
520   for (k=0; k<mbs; k++){
521     v  = aa + 36*ai[k];
522     vj = aj + ai[k];
523     tp = t + k*6;
524     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
525     nz = ai[k+1] - ai[k];
526     tp = t + (*vj)*6;
527     while (nz--) {
528       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
529       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
530       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
531       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
532       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
533       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
534       vj++; tp = t + (*vj)*6;
535       v += 36;
536     }
537 
538     /* xk = inv(Dk)*(Dk*xk) */
539     d  = aa+k*36;          /* ptr to inv(Dk) */
540     tp    = t + k*6;
541     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
542     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
543     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
544     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
545     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
546     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
547   }
548 
549   /* solve U*x = y by back substitution */
550   for (k=mbs-1; k>=0; k--){
551     v  = aa + 36*ai[k];
552     vj = aj + ai[k];
553     tp    = t + k*6;
554     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
555     nz = ai[k+1] - ai[k];
556 
557     tp = t + (*vj)*6;
558     while (nz--) {
559       /* xk += U(k,:)*x(:) */
560       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
561       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
562       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
563       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
564       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
565       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
566       vj++; tp = t + (*vj)*6;
567       v += 36;
568     }
569     tp    = t + k*6;
570     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
571     idx   = 6*r[k];
572     x[idx]     = x0;
573     x[idx+1]   = x1;
574     x[idx+2]   = x2;
575     x[idx+3]   = x3;
576     x[idx+4]   = x4;
577     x[idx+5]   = x5;
578   }
579 
580   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
581   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
582   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
583   ierr = PetscLogFlops(36.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
584   PetscFunctionReturn(0);
585 }
586 
587 #undef __FUNCT__
588 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
589 PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
590 {
591   MatScalar      *v,*d;
592   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
593   PetscInt       nz,*vj,k;
594 
595   PetscFunctionBegin;
596   for (k=0; k<mbs; k++){
597     v  = aa + 36*ai[k];
598     xp = x + k*6;
599     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
600     nz = ai[k+1] - ai[k];
601     vj = aj + ai[k];
602     xp = x + (*vj)*6;
603     while (nz--) {
604       /* x(:) += U(k,:)^T*(Dk*xk) */
605       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
606       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
607       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
608       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
609       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
610       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
611       vj++; xp = x + (*vj)*6;
612       v += 36;
613     }
614     /* xk = inv(Dk)*(Dk*xk) */
615     d  = aa+k*36;          /* ptr to inv(Dk) */
616     xp = x + k*6;
617     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
618     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
619     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
620     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
621     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
622     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
623   }
624   PetscFunctionReturn(0);
625 }
626 #undef __FUNCT__
627 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
628 PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
629 {
630   MatScalar      *v;
631   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
632   PetscInt       nz,*vj,k;
633 
634   PetscFunctionBegin;
635   for (k=mbs-1; k>=0; k--){
636     v  = aa + 36*ai[k];
637     xp = x + k*6;
638     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
639     nz = ai[k+1] - ai[k];
640     vj = aj + ai[k];
641     xp = x + (*vj)*6;
642     while (nz--) {
643       /* xk += U(k,:)*x(:) */
644       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
645       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
646       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
647       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
648       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
649       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
650       vj++;
651       v += 36; xp = x + (*vj)*6;
652     }
653     xp = x + k*6;
654     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
655   }
656   PetscFunctionReturn(0);
657 }
658 
659 
660 #undef __FUNCT__
661 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
662 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
663 {
664   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
665   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
666   MatScalar      *aa=a->a;
667   PetscScalar    *x,*b;
668   PetscErrorCode ierr;
669 
670   PetscFunctionBegin;
671   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
672   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
673 
674   /* solve U^T * D * y = b by forward substitution */
675   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
676   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
677 
678   /* solve U*x = y by back substitution */
679   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
680 
681   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683   ierr = PetscLogFlops(36.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
684   PetscFunctionReturn(0);
685 }
686 
687 #undef __FUNCT__
688 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
689 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
690 {
691   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
692   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
693   MatScalar      *aa=a->a;
694   PetscScalar    *x,*b;
695   PetscErrorCode ierr;
696 
697   PetscFunctionBegin;
698   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
699   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
700   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
701   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
702   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
703   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
704   ierr = PetscLogFlops(36.0*a->nz + mbs);CHKERRQ(ierr);
705   PetscFunctionReturn(0);
706 }
707 
708 #undef __FUNCT__
709 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
710 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
711 {
712   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
713   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
714   MatScalar      *aa=a->a;
715   PetscScalar    *x,*b;
716   PetscErrorCode ierr;
717 
718   PetscFunctionBegin;
719   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
720   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
721   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
722   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
723   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
724   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
725   ierr = PetscLogFlops(36.0*a->nz);CHKERRQ(ierr);
726   PetscFunctionReturn(0);
727 }
728 
729 #undef __FUNCT__
730 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
731 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
732 {
733   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
734   IS             isrow=a->row;
735   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
736   PetscErrorCode ierr;
737   const PetscInt *r;
738   PetscInt       nz,*vj,k,idx;
739   MatScalar      *aa=a->a,*v,*diag;
740   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;
741 
742   PetscFunctionBegin;
743   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
744   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
745   t  = a->solve_work;
746   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
747 
748   /* solve U^T * D * y = b by forward substitution */
749   tp = t;
750   for (k=0; k<mbs; k++) { /* t <- perm(b) */
751     idx   = 5*r[k];
752     tp[0] = b[idx];
753     tp[1] = b[idx+1];
754     tp[2] = b[idx+2];
755     tp[3] = b[idx+3];
756     tp[4] = b[idx+4];
757     tp += 5;
758   }
759 
760   for (k=0; k<mbs; k++){
761     v  = aa + 25*ai[k];
762     vj = aj + ai[k];
763     tp = t + k*5;
764     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
765     nz = ai[k+1] - ai[k];
766 
767     tp = t + (*vj)*5;
768     while (nz--) {
769       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
770       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
771       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
772       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
773       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
774       vj++; tp = t + (*vj)*5;
775       v += 25;
776     }
777 
778     /* xk = inv(Dk)*(Dk*xk) */
779     diag  = aa+k*25;          /* ptr to inv(Dk) */
780     tp    = t + k*5;
781       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
782       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
783       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
784       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
785       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
786   }
787 
788   /* solve U*x = y by back substitution */
789   for (k=mbs-1; k>=0; k--){
790     v  = aa + 25*ai[k];
791     vj = aj + ai[k];
792     tp    = t + k*5;
793     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
794     nz = ai[k+1] - ai[k];
795 
796     tp = t + (*vj)*5;
797     while (nz--) {
798       /* xk += U(k,:)*x(:) */
799       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
800       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
801       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
802       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
803       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
804       vj++; tp = t + (*vj)*5;
805       v += 25;
806     }
807     tp    = t + k*5;
808     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
809     idx   = 5*r[k];
810     x[idx]     = x0;
811     x[idx+1]   = x1;
812     x[idx+2]   = x2;
813     x[idx+3]   = x3;
814     x[idx+4]   = x4;
815   }
816 
817   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
818   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
819   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
820   ierr = PetscLogFlops(25.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
821   PetscFunctionReturn(0);
822 }
823 
824 #undef __FUNCT__
825 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
826 PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
827 {
828   MatScalar      *v,*diag;
829   PetscScalar    *xp,x0,x1,x2,x3,x4;
830   PetscInt       nz,*vj,k;
831 
832   PetscFunctionBegin;
833   for (k=0; k<mbs; k++){
834     v  = aa + 25*ai[k];
835     xp = x + k*5;
836     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
837     nz = ai[k+1] - ai[k];
838     vj = aj + ai[k];
839     xp = x + (*vj)*5;
840     while (nz--) {
841       /* x(:) += U(k,:)^T*(Dk*xk) */
842       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
843       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
844       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
845       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
846       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
847       vj++; xp = x + (*vj)*5;
848       v += 25;
849     }
850     /* xk = inv(Dk)*(Dk*xk) */
851     diag = aa+k*25;          /* ptr to inv(Dk) */
852     xp   = x + k*5;
853     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
854     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
855     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
856     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
857     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
858   }
859   PetscFunctionReturn(0);
860 }
861 
862 #undef __FUNCT__
863 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
864 PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
865 {
866   MatScalar      *v;
867   PetscScalar    *xp,x0,x1,x2,x3,x4;
868   PetscInt       nz,*vj,k;
869 
870   PetscFunctionBegin;
871   for (k=mbs-1; k>=0; k--){
872     v  = aa + 25*ai[k];
873     xp = x + k*5;
874     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
875     nz = ai[k+1] - ai[k];
876     vj = aj + ai[k];
877     xp = x + (*vj)*5;
878     while (nz--) {
879       /* xk += U(k,:)*x(:) */
880       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
881       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
882       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
883       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
884       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
885       vj++;
886       v += 25; xp = x + (*vj)*5;
887     }
888     xp = x + k*5;
889     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
890   }
891   PetscFunctionReturn(0);
892 }
893 
894 #undef __FUNCT__
895 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
896 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
897 {
898   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
899   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
900   MatScalar      *aa=a->a;
901   PetscScalar    *x,*b;
902   PetscErrorCode ierr;
903 
904   PetscFunctionBegin;
905   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
906   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
907 
908   /* solve U^T * D * y = b by forward substitution */
909   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
910   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
911 
912   /* solve U*x = y by back substitution */
913   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
914 
915   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
916   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
917   ierr = PetscLogFlops(25.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
918   PetscFunctionReturn(0);
919 }
920 
921 #undef __FUNCT__
922 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
923 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
924 {
925   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
926   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
927   MatScalar      *aa=a->a;
928   PetscScalar    *x,*b;
929   PetscErrorCode ierr;
930 
931   PetscFunctionBegin;
932   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
933   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
934   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
935   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
936   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
937   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
938   ierr = PetscLogFlops(25.0*(a->nz + mbs));CHKERRQ(ierr);
939   PetscFunctionReturn(0);
940 }
941 
942 #undef __FUNCT__
943 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
944 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
945 {
946   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
947   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
948   MatScalar      *aa=a->a;
949   PetscScalar    *x,*b;
950   PetscErrorCode ierr;
951 
952   PetscFunctionBegin;
953   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
954   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
955   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
956   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
957   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
958   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
959   ierr = PetscLogFlops(25.0*a->nz);CHKERRQ(ierr);
960   PetscFunctionReturn(0);
961 }
962 
963 #undef __FUNCT__
964 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
965 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
966 {
967   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
968   IS             isrow=a->row;
969   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
970   PetscErrorCode ierr;
971   const PetscInt *r;
972   PetscInt       nz,*vj,k,idx;
973   MatScalar      *aa=a->a,*v,*diag;
974   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;
975 
976   PetscFunctionBegin;
977   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
978   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
979   t  = a->solve_work;
980   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
981 
982   /* solve U^T * D * y = b by forward substitution */
983   tp = t;
984   for (k=0; k<mbs; k++) { /* t <- perm(b) */
985     idx   = 4*r[k];
986     tp[0] = b[idx];
987     tp[1] = b[idx+1];
988     tp[2] = b[idx+2];
989     tp[3] = b[idx+3];
990     tp += 4;
991   }
992 
993   for (k=0; k<mbs; k++){
994     v  = aa + 16*ai[k];
995     vj = aj + ai[k];
996     tp = t + k*4;
997     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
998     nz = ai[k+1] - ai[k];
999 
1000     tp = t + (*vj)*4;
1001     while (nz--) {
1002       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1003       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1004       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1005       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1006       vj++; tp = t + (*vj)*4;
1007       v += 16;
1008     }
1009 
1010     /* xk = inv(Dk)*(Dk*xk) */
1011     diag  = aa+k*16;          /* ptr to inv(Dk) */
1012     tp    = t + k*4;
1013     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1014     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1015     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1016     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1017   }
1018 
1019   /* solve U*x = y by back substitution */
1020   for (k=mbs-1; k>=0; k--){
1021     v  = aa + 16*ai[k];
1022     vj = aj + ai[k];
1023     tp    = t + k*4;
1024     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1025     nz = ai[k+1] - ai[k];
1026 
1027     tp = t + (*vj)*4;
1028     while (nz--) {
1029       /* xk += U(k,:)*x(:) */
1030       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1031       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1032       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1033       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1034       vj++; tp = t + (*vj)*4;
1035       v += 16;
1036     }
1037     tp    = t + k*4;
1038     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1039     idx        = 4*r[k];
1040     x[idx]     = x0;
1041     x[idx+1]   = x1;
1042     x[idx+2]   = x2;
1043     x[idx+3]   = x3;
1044   }
1045 
1046   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1047   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1048   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1049   ierr = PetscLogFlops(16.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
1050   PetscFunctionReturn(0);
1051 }
1052 
1053 #undef __FUNCT__
1054 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1055 PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1056 {
1057   MatScalar      *v,*diag;
1058   PetscScalar    *xp,x0,x1,x2,x3;
1059   PetscInt       nz,*vj,k;
1060 
1061   PetscFunctionBegin;
1062   for (k=0; k<mbs; k++){
1063     v  = aa + 16*ai[k];
1064     xp = x + k*4;
1065     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1066     nz = ai[k+1] - ai[k];
1067     vj = aj + ai[k];
1068     xp = x + (*vj)*4;
1069     while (nz--) {
1070       /* x(:) += U(k,:)^T*(Dk*xk) */
1071       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1072       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1073       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1074       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1075       vj++; xp = x + (*vj)*4;
1076       v += 16;
1077     }
1078     /* xk = inv(Dk)*(Dk*xk) */
1079     diag = aa+k*16;          /* ptr to inv(Dk) */
1080     xp   = x + k*4;
1081     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1082     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1083     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1084     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1085   }
1086   PetscFunctionReturn(0);
1087 }
1088 
1089 #undef __FUNCT__
1090 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1091 PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1092 {
1093   MatScalar      *v;
1094   PetscScalar    *xp,x0,x1,x2,x3;
1095   PetscInt       nz,*vj,k;
1096 
1097   PetscFunctionBegin;
1098   for (k=mbs-1; k>=0; k--){
1099     v  = aa + 16*ai[k];
1100     xp = x + k*4;
1101     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1102     nz = ai[k+1] - ai[k];
1103     vj = aj + ai[k];
1104     xp = x + (*vj)*4;
1105     while (nz--) {
1106       /* xk += U(k,:)*x(:) */
1107       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1108       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1109       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1110       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1111       vj++;
1112       v += 16; xp = x + (*vj)*4;
1113     }
1114     xp = x + k*4;
1115     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1116   }
1117   PetscFunctionReturn(0);
1118 }
1119 
1120 #undef __FUNCT__
1121 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1122 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1123 {
1124   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1125   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1126   MatScalar      *aa=a->a;
1127   PetscScalar    *x,*b;
1128   PetscErrorCode ierr;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133 
1134   /* solve U^T * D * y = b by forward substitution */
1135   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1136   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1137 
1138   /* solve U*x = y by back substitution */
1139   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1140   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1141   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1142   ierr = PetscLogFlops(16.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
1143   PetscFunctionReturn(0);
1144 }
1145 
1146 #undef __FUNCT__
1147 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1148 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1149 {
1150   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1151   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1152   MatScalar      *aa=a->a;
1153   PetscScalar    *x,*b;
1154   PetscErrorCode ierr;
1155 
1156   PetscFunctionBegin;
1157   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1160   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1161   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1162   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1163   ierr = PetscLogFlops(16.0*a->nz + mbs);CHKERRQ(ierr);
1164   PetscFunctionReturn(0);
1165 }
1166 
1167 #undef __FUNCT__
1168 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1169 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1170 {
1171   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1172   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1173   MatScalar      *aa=a->a;
1174   PetscScalar    *x,*b;
1175   PetscErrorCode ierr;
1176 
1177   PetscFunctionBegin;
1178   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1179   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1180   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1181   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1182   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1183   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1184   ierr = PetscLogFlops(16.0*a->nz);CHKERRQ(ierr);
1185   PetscFunctionReturn(0);
1186 }
1187 
1188 #undef __FUNCT__
1189 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1190 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1191 {
1192   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1193   IS             isrow=a->row;
1194   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1195   PetscErrorCode ierr;
1196   const PetscInt *r;
1197   PetscInt       nz,*vj,k,idx;
1198   MatScalar      *aa=a->a,*v,*diag;
1199   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;
1200 
1201   PetscFunctionBegin;
1202   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1203   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1204   t  = a->solve_work;
1205   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1206 
1207   /* solve U^T * D * y = b by forward substitution */
1208   tp = t;
1209   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1210     idx   = 3*r[k];
1211     tp[0] = b[idx];
1212     tp[1] = b[idx+1];
1213     tp[2] = b[idx+2];
1214     tp += 3;
1215   }
1216 
1217   for (k=0; k<mbs; k++){
1218     v  = aa + 9*ai[k];
1219     vj = aj + ai[k];
1220     tp = t + k*3;
1221     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1222     nz = ai[k+1] - ai[k];
1223 
1224     tp = t + (*vj)*3;
1225     while (nz--) {
1226       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1227       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1228       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1229       vj++; tp = t + (*vj)*3;
1230       v += 9;
1231     }
1232 
1233     /* xk = inv(Dk)*(Dk*xk) */
1234     diag  = aa+k*9;          /* ptr to inv(Dk) */
1235     tp    = t + k*3;
1236     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1237     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1238     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1239   }
1240 
1241   /* solve U*x = y by back substitution */
1242   for (k=mbs-1; k>=0; k--){
1243     v  = aa + 9*ai[k];
1244     vj = aj + ai[k];
1245     tp    = t + k*3;
1246     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1247     nz = ai[k+1] - ai[k];
1248 
1249     tp = t + (*vj)*3;
1250     while (nz--) {
1251       /* xk += U(k,:)*x(:) */
1252       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1253       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1254       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1255       vj++; tp = t + (*vj)*3;
1256       v += 9;
1257     }
1258     tp    = t + k*3;
1259     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1260     idx      = 3*r[k];
1261     x[idx]   = x0;
1262     x[idx+1] = x1;
1263     x[idx+2] = x2;
1264   }
1265 
1266   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1267   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1268   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1269   ierr = PetscLogFlops(9.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
1270   PetscFunctionReturn(0);
1271 }
1272 
1273 #undef __FUNCT__
1274 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1275 PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1276 {
1277   MatScalar      *v,*diag;
1278   PetscScalar    *xp,x0,x1,x2;
1279   PetscInt       nz,*vj,k;
1280 
1281   PetscFunctionBegin;
1282   for (k=0; k<mbs; k++){
1283     v  = aa + 9*ai[k];
1284     xp = x + k*3;
1285     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1286     nz = ai[k+1] - ai[k];
1287     vj = aj + ai[k];
1288     xp = x + (*vj)*3;
1289     while (nz--) {
1290       /* x(:) += U(k,:)^T*(Dk*xk) */
1291       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1292       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1293       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1294       vj++; xp = x + (*vj)*3;
1295       v += 9;
1296     }
1297     /* xk = inv(Dk)*(Dk*xk) */
1298     diag = aa+k*9;          /* ptr to inv(Dk) */
1299     xp   = x + k*3;
1300     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1301     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1302     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1303   }
1304   PetscFunctionReturn(0);
1305 }
1306 
1307 #undef __FUNCT__
1308 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1309 PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1310 {
1311   MatScalar      *v;
1312   PetscScalar    *xp,x0,x1,x2;
1313   PetscInt       nz,*vj,k;
1314 
1315   PetscFunctionBegin;
1316   for (k=mbs-1; k>=0; k--){
1317     v  = aa + 9*ai[k];
1318     xp = x + k*3;
1319     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1320     nz = ai[k+1] - ai[k];
1321     vj = aj + ai[k];
1322     xp = x + (*vj)*3;
1323     while (nz--) {
1324       /* xk += U(k,:)*x(:) */
1325       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1326       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1327       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1328       vj++;
1329       v += 9; xp = x + (*vj)*3;
1330     }
1331     xp = x + k*3;
1332     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1333   }
1334   PetscFunctionReturn(0);
1335 }
1336 
1337 #undef __FUNCT__
1338 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1339 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1340 {
1341   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1342   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1343   MatScalar      *aa=a->a;
1344   PetscScalar    *x,*b;
1345   PetscErrorCode ierr;
1346 
1347   PetscFunctionBegin;
1348   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1349   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1350 
1351   /* solve U^T * D * y = b by forward substitution */
1352   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1353   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1354 
1355   /* solve U*x = y by back substitution */
1356   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1357 
1358   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1359   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1360   ierr = PetscLogFlops(9.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
1361   PetscFunctionReturn(0);
1362 }
1363 
1364 #undef __FUNCT__
1365 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1366 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1367 {
1368   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1369   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1370   MatScalar      *aa=a->a;
1371   PetscScalar    *x,*b;
1372   PetscErrorCode ierr;
1373 
1374   PetscFunctionBegin;
1375   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1376   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1377   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1378   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1379   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1380   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1381   ierr = PetscLogFlops(9.0*(a->nz + mbs));CHKERRQ(ierr);
1382   PetscFunctionReturn(0);
1383 }
1384 
1385 #undef __FUNCT__
1386 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1387 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1388 {
1389   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1390   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1391   MatScalar      *aa=a->a;
1392   PetscScalar    *x,*b;
1393   PetscErrorCode ierr;
1394 
1395   PetscFunctionBegin;
1396   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1397   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1398   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1399   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1400   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1401   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1402   ierr = PetscLogFlops(9.0*a->nz);CHKERRQ(ierr);
1403   PetscFunctionReturn(0);
1404 }
1405 
1406 #undef __FUNCT__
1407 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1408 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1409 {
1410   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1411   IS             isrow=a->row;
1412   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1413   PetscErrorCode ierr;
1414   const PetscInt *r;
1415   PetscInt       nz,*vj,k,k2,idx;
1416   MatScalar      *aa=a->a,*v,*diag;
1417   PetscScalar    *x,*b,x0,x1,*t;
1418 
1419   PetscFunctionBegin;
1420   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1421   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1422   t  = a->solve_work;
1423   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1424 
1425   /* solve U^T * D * y = perm(b) by forward substitution */
1426   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1427     idx = 2*r[k];
1428     t[k*2]   = b[idx];
1429     t[k*2+1] = b[idx+1];
1430   }
1431   for (k=0; k<mbs; k++){
1432     v  = aa + 4*ai[k];
1433     vj = aj + ai[k];
1434     k2 = k*2;
1435     x0 = t[k2]; x1 = t[k2+1];
1436     nz = ai[k+1] - ai[k];
1437     while (nz--) {
1438       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1439       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1440       vj++; v += 4;
1441     }
1442     diag = aa+k*4;  /* ptr to inv(Dk) */
1443     t[k2]   = diag[0]*x0 + diag[2]*x1;
1444     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1445   }
1446 
1447   /* solve U*x = y by back substitution */
1448   for (k=mbs-1; k>=0; k--){
1449     v  = aa + 4*ai[k];
1450     vj = aj + ai[k];
1451     k2 = k*2;
1452     x0 = t[k2]; x1 = t[k2+1];
1453     nz = ai[k+1] - ai[k];
1454     while (nz--) {
1455       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1456       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1457       vj++; v += 4;
1458     }
1459     t[k2]    = x0;
1460     t[k2+1]  = x1;
1461     idx      = 2*r[k];
1462     x[idx]   = x0;
1463     x[idx+1] = x1;
1464   }
1465 
1466   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1467   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1468   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1469   ierr = PetscLogFlops(4.0*(2.0*a->nz + mbs));CHKERRQ(ierr);
1470   PetscFunctionReturn(0);
1471 }
1472 
1473 #undef __FUNCT__
1474 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1475 PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1476 {
1477   MatScalar      *v,*diag;
1478   PetscScalar    x0,x1;
1479   PetscInt       nz,*vj,k,k2;
1480 
1481   PetscFunctionBegin;
1482   for (k=0; k<mbs; k++){
1483     v  = aa + 4*ai[k];
1484     vj = aj + ai[k];
1485     k2 = k*2;
1486     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1487     nz = ai[k+1] - ai[k];
1488 
1489     while (nz--) {
1490       /* x(:) += U(k,:)^T*(Dk*xk) */
1491       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1492       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1493       vj++; v += 4;
1494     }
1495     /* xk = inv(Dk)*(Dk*xk) */
1496     diag = aa+k*4;          /* ptr to inv(Dk) */
1497     x[k2]   = diag[0]*x0 + diag[2]*x1;
1498     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1499   }
1500   PetscFunctionReturn(0);
1501 }
1502 
1503 #undef __FUNCT__
1504 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1505 PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1506 {
1507   MatScalar      *v;
1508   PetscScalar    x0,x1;
1509   PetscInt       nz,*vj,k,k2;
1510 
1511   PetscFunctionBegin;
1512   for (k=mbs-1; k>=0; k--){
1513     v  = aa + 4*ai[k];
1514     vj = aj + ai[k];
1515     k2 = k*2;
1516     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1517     nz = ai[k+1] - ai[k];
1518     while (nz--) {
1519       /* xk += U(k,:)*x(:) */
1520       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1521       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1522       vj++; v += 4;
1523     }
1524     x[k2]     = x0;
1525     x[k2+1]   = x1;
1526   }
1527   PetscFunctionReturn(0);
1528 }
1529 
1530 #undef __FUNCT__
1531 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1532 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1533 {
1534   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1535   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1536   MatScalar      *aa=a->a;
1537   PetscScalar    *x,*b;
1538   PetscErrorCode ierr;
1539 
1540   PetscFunctionBegin;
1541   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1542   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1543 
1544   /* solve U^T * D * y = b by forward substitution */
1545   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1546   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1547 
1548   /* solve U*x = y by back substitution */
1549   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1550 
1551   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1552   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1553   ierr = PetscLogFlops(4.0*(2.0*a->nz + mbs));CHKERRQ(ierr); /* bs2*(2*a->nz + mbs) */
1554   PetscFunctionReturn(0);
1555 }
1556 
1557 #undef __FUNCT__
1558 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1559 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1560 {
1561   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1562   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1563   MatScalar      *aa=a->a;
1564   PetscScalar    *x,*b;
1565   PetscErrorCode ierr;
1566 
1567   PetscFunctionBegin;
1568   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1569   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1570   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1571   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1572   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1574   ierr = PetscLogFlops(4.0*(a->nz + mbs));CHKERRQ(ierr);
1575   PetscFunctionReturn(0);
1576 }
1577 
1578 #undef __FUNCT__
1579 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1580 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1581 {
1582   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1583   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1584   MatScalar      *aa=a->a;
1585   PetscScalar    *x,*b;
1586   PetscErrorCode ierr;
1587 
1588   PetscFunctionBegin;
1589   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1590   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1591   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1592   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1593   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1596   PetscFunctionReturn(0);
1597 }
1598 
1599 #undef __FUNCT__
1600 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1601 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1602 {
1603   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1604   IS                isrow=a->row;
1605   PetscErrorCode    ierr;
1606   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1607   const MatScalar   *aa=a->a,*v;
1608   const PetscScalar *b;
1609   PetscScalar       *x,xk,*t;
1610   PetscInt          nz,k,j;
1611 
1612   PetscFunctionBegin;
1613   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1614   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1615   t    = a->solve_work;
1616   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1617 
1618   /* solve U^T*D^(1/2)*y = perm(b) by forward substitution */
1619   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1620   for (k=0; k<mbs; k++){
1621     v  = aa + ai[k];
1622     vj = aj + ai[k];
1623     xk = t[k];
1624     nz = ai[k+1] - ai[k] - 1;
1625     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1626     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1627   }
1628 
1629   /* solve U*perm(x) = y by back substitution */
1630   for (k=mbs-1; k>=0; k--){
1631     v  = aa + adiag[k] - 1;
1632     vj = aj + adiag[k] - 1;
1633     nz = ai[k+1] - ai[k] - 1;
1634     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1635     x[rp[k]] = t[k];
1636   }
1637 
1638   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1639   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1640   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1641   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1642   PetscFunctionReturn(0);
1643 }
1644 
1645 #undef __FUNCT__
1646 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1647 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1648 {
1649   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1650   IS              isrow=a->row;
1651   PetscErrorCode  ierr;
1652   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1653   const MatScalar *aa=a->a,*v;
1654   PetscScalar     *x,*b,xk,*t;
1655   PetscInt        nz,k;
1656 
1657   PetscFunctionBegin;
1658   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1659   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1660   t    = a->solve_work;
1661   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1662 
1663   /* solve U^T*D^(1/2)*y = perm(b) by forward substitution */
1664   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1665   for (k=0; k<mbs; k++){
1666     v  = aa + ai[k] + 1;
1667     vj = aj + ai[k] + 1;
1668     xk = t[k];
1669     nz = ai[k+1] - ai[k] - 1;
1670     while (nz--) t[*vj++] += (*v++) * xk;
1671     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1672   }
1673 
1674   /* solve U*perm(x) = y by back substitution */
1675   for (k=mbs-1; k>=0; k--){
1676     v  = aa + ai[k] + 1;
1677     vj = aj + ai[k] + 1;
1678     nz = ai[k+1] - ai[k] - 1;
1679     while (nz--) t[k] += (*v++) * t[*vj++];
1680     x[rp[k]] = t[k];
1681   }
1682 
1683   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1684   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1685   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1686   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1687   PetscFunctionReturn(0);
1688 }
1689 
1690 #undef __FUNCT__
1691 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1692 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1693 {
1694   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1695   IS              isrow=a->row;
1696   PetscErrorCode  ierr;
1697   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1698   const MatScalar *aa=a->a,*v;
1699   PetscReal       diagk;
1700   PetscScalar     *x,*b,xk;
1701   PetscInt        nz,k;
1702 
1703   PetscFunctionBegin;
1704   SETERRQ(1,"Not done yet");
1705   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1706   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1709 
1710   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1711   for (k=0; k<mbs; k++){
1712     v  = aa + ai[k] + 1;
1713     vj = aj + ai[k] + 1;
1714     xk = x[k];
1715     nz = ai[k+1] - ai[k] - 1;
1716     while (nz--) x[*vj++] += (*v++) * xk;
1717 
1718     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1719     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1720     x[k] = xk*sqrt(diagk);
1721   }
1722   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1723   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1724   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1725   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1726   PetscFunctionReturn(0);
1727 }
1728 
1729 #undef __FUNCT__
1730 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1731 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1732 {
1733   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1734   IS              isrow=a->row;
1735   PetscErrorCode  ierr;
1736   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1737   const MatScalar *aa=a->a,*v;
1738   PetscReal       diagk;
1739   PetscScalar     *x,*b,xk;
1740   PetscInt        nz,k;
1741 
1742   PetscFunctionBegin;
1743   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1744   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1746   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1747 
1748   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1749   for (k=0; k<mbs; k++){
1750     v  = aa + ai[k] + 1;
1751     vj = aj + ai[k] + 1;
1752     xk = x[k];
1753     nz = ai[k+1] - ai[k] - 1;
1754     while (nz--) x[*vj++] += (*v++) * xk;
1755 
1756     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1757     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1758     x[k] = xk*sqrt(diagk);
1759   }
1760   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1761   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1763   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1764   PetscFunctionReturn(0);
1765 }
1766 
1767 #undef __FUNCT__
1768 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1769 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1770 {
1771   SETERRQ(1,"Not done yet");
1772 }
1773 
1774 #undef __FUNCT__
1775 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1776 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1777 {
1778   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1779   IS              isrow=a->row;
1780   PetscErrorCode  ierr;
1781   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1782   const MatScalar *aa=a->a,*v;
1783   PetscReal       diagk;
1784   PetscScalar     *x,*b,*t;
1785   PetscInt        nz,k;
1786 
1787   PetscFunctionBegin;
1788   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1789   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1790   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1791   t    = a->solve_work;
1792   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1793 
1794   for (k=mbs-1; k>=0; k--){
1795     v  = aa + ai[k] + 1;
1796     vj = aj + ai[k] + 1;
1797     diagk = PetscRealPart(aa[ai[k]]);
1798     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1799     t[k] = b[k] * sqrt(diagk);
1800     nz = ai[k+1] - ai[k] - 1;
1801     while (nz--) t[k] += (*v++) * t[*vj++];
1802     x[rp[k]] = t[k];
1803   }
1804   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1805   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1806   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1807   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1808   PetscFunctionReturn(0);
1809 }
1810 
1811 #undef __FUNCT__
1812 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1813 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1814 {
1815   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1816   PetscErrorCode ierr;
1817 
1818   PetscFunctionBegin;
1819   if (A->rmap->bs == 1) {
1820     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1821   } else {
1822     IS              isrow=a->row;
1823     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1824     const MatScalar *aa=a->a,*v;
1825     PetscScalar     *x,*b,*t;
1826     PetscInt        nz,k,n,i;
1827     if (bb->n > a->solves_work_n) {
1828       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1829       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1830       a->solves_work_n = bb->n;
1831     }
1832     n    = bb->n;
1833     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1834     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1835     t    = a->solves_work;
1836 
1837     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1838 
1839     /* solve U^T*D*y = perm(b) by forward substitution */
1840     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1841     for (k=0; k<mbs; k++){
1842       v  = aa + ai[k];
1843       vj = aj + ai[k];
1844       nz = ai[k+1] - ai[k];
1845       while (nz--) {
1846         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1847         v++;vj++;
1848       }
1849       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1850     }
1851 
1852     /* solve U*perm(x) = y by back substitution */
1853     for (k=mbs-1; k>=0; k--){
1854       v  = aa + ai[k];
1855       vj = aj + ai[k];
1856       nz = ai[k+1] - ai[k];
1857       while (nz--) {
1858         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1859         v++;vj++;
1860       }
1861       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1862     }
1863 
1864     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1865     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1866     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1867     ierr = PetscLogFlops(bb->n*(4.0*a->nz + A->rmap->N));CHKERRQ(ierr);
1868   }
1869   PetscFunctionReturn(0);
1870 }
1871 
1872 #undef __FUNCT__
1873 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
1874 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1875 {
1876   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1877   PetscErrorCode    ierr;
1878   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1879   const MatScalar   *aa=a->a,*v;
1880   const PetscScalar *b;
1881   PetscScalar       *x,xi;
1882   PetscInt          nz,i,j;
1883 
1884   PetscFunctionBegin;
1885   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1886   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1887 
1888   /* solve U^T*D*y = b by forward substitution */
1889   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1890   for (i=0; i<mbs; i++){
1891     v  = aa + ai[i];
1892     vj = aj + ai[i];
1893     xi = x[i];
1894     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1895     for (j=0; j<nz; j++){
1896       x[vj[j]] += v[j]* xi;
1897     }
1898     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1899   }
1900 
1901   /* solve U*x = y by backward substitution */
1902   for (i=mbs-2; i>=0; i--){
1903     xi = x[i];
1904     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1905     vj = aj + adiag[i] - 1;
1906     nz = ai[i+1] - ai[i] - 1;
1907     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1908     x[i] = xi;
1909   }
1910 
1911   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1912   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1913   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1914   PetscFunctionReturn(0);
1915 }
1916 
1917 #undef __FUNCT__
1918 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
1919 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1920 {
1921   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1922   PetscErrorCode ierr;
1923   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1924   MatScalar      *aa=a->a,*v;
1925   PetscScalar    *x,*b,xk;
1926   PetscInt       nz,*vj,k;
1927 
1928   PetscFunctionBegin;
1929   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1930   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1931 
1932   /* solve U^T*D*y = b by forward substitution */
1933   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1934   for (k=0; k<mbs; k++){
1935     v  = aa + ai[k] + 1;
1936     vj = aj + ai[k] + 1;
1937     xk = x[k];
1938     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1939     while (nz--) x[*vj++] += (*v++) * xk;
1940     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
1941   }
1942 
1943   /* solve U*x = y by back substitution */
1944   for (k=mbs-2; k>=0; k--){
1945     v  = aa + ai[k] + 1;
1946     vj = aj + ai[k] + 1;
1947     xk = x[k];
1948     nz = ai[k+1] - ai[k] - 1;
1949     while (nz--) {
1950       xk += (*v++) * x[*vj++];
1951     }
1952     x[k] = xk;
1953   }
1954 
1955   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1956   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1957   ierr = PetscLogFlops(4.0*a->nz);CHKERRQ(ierr);
1958   PetscFunctionReturn(0);
1959 }
1960 
1961 #undef __FUNCT__
1962 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
1963 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1964 {
1965   SETERRQ(1,"Not done yet");
1966 }
1967 
1968 #undef __FUNCT__
1969 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
1970 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1971 {
1972   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1973   PetscErrorCode ierr;
1974   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
1975   MatScalar      *aa=a->a,*v;
1976   PetscReal      diagk;
1977   PetscScalar    *x,*b;
1978   PetscInt       nz,*vj,k;
1979 
1980   PetscFunctionBegin;
1981   /* solve U^T*D^(1/2)*x = b by forward substitution */
1982   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1983   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1984   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1985   for (k=0; k<mbs; k++){
1986     v  = aa + ai[k] + 1;
1987     vj = aj + ai[k] + 1;
1988     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
1989     while (nz--) x[*vj++] += (*v++) * x[k];
1990     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1991     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
1992     x[k] *= sqrt(diagk);
1993   }
1994   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1995   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1996   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
1997   PetscFunctionReturn(0);
1998 }
1999 
2000 #undef __FUNCT__
2001 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2002 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2003 {
2004   SETERRQ(1,"Not done yet");
2005 }
2006 
2007 #undef __FUNCT__
2008 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2009 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2010 {
2011   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2012   PetscErrorCode ierr;
2013   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2014   MatScalar      *aa=a->a,*v;
2015   PetscReal      diagk;
2016   PetscScalar    *x,*b;
2017   PetscInt       nz,*vj,k;
2018 
2019   PetscFunctionBegin;
2020   /* solve D^(1/2)*U*x = b by back substitution */
2021   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2022   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2023 
2024   for (k=mbs-1; k>=0; k--){
2025     v  = aa + ai[k] + 1;
2026     vj = aj + ai[k] + 1;
2027     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2028     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2029     x[k] = sqrt(diagk)*b[k];
2030     nz = ai[k+1] - ai[k] - 1;
2031     while (nz--) x[k] += (*v++) * x[*vj++];
2032   }
2033   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2034   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2035   ierr = PetscLogFlops(2.0*a->nz);CHKERRQ(ierr);
2036   PetscFunctionReturn(0);
2037 }
2038 
2039 extern PetscErrorCode MatSeqSBAIJSetNumericFactorization(Mat,PetscTruth);
2040 
2041 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2042 #undef __FUNCT__
2043 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2044 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2045 {
2046   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2047   PetscErrorCode ierr;
2048   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2049   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2050   PetscInt       m,reallocs = 0,*levtmp;
2051   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2052   PetscInt       incrlev,*lev,shift,prow,nz;
2053   PetscReal      f = info->fill,levels = info->levels;
2054   PetscTruth     perm_identity;
2055 
2056   PetscFunctionBegin;
2057   /* check whether perm is the identity mapping */
2058   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2059 
2060   if (perm_identity){
2061     a->permute = PETSC_FALSE;
2062     ai = a->i; aj = a->j;
2063   } else { /*  non-trivial permutation */
2064     a->permute = PETSC_TRUE;
2065     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2066     ai = a->inew; aj = a->jnew;
2067   }
2068 
2069   /* initialization */
2070   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2071   umax  = (PetscInt)(f*ai[mbs] + 1);
2072   ierr  = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr);
2073   umax += mbs + 1;
2074   shift = mbs + 1;
2075   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
2076   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
2077   iu[0] = mbs + 1;
2078   juidx = mbs + 1;
2079   /* prowl: linked list for pivot row */
2080   ierr    = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr);
2081   /* q: linked list for col index */
2082 
2083   for (i=0; i<mbs; i++){
2084     prowl[i] = mbs;
2085     q[i] = 0;
2086   }
2087 
2088   /* for each row k */
2089   for (k=0; k<mbs; k++){
2090     nzk  = 0;
2091     q[k] = mbs;
2092     /* copy current row into linked list */
2093     nz = ai[rip[k]+1] - ai[rip[k]];
2094     j = ai[rip[k]];
2095     while (nz--){
2096       vj = rip[aj[j++]];
2097       if (vj > k){
2098         qm = k;
2099         do {
2100           m  = qm; qm = q[m];
2101         } while(qm < vj);
2102         if (qm == vj) {
2103           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
2104         }
2105         nzk++;
2106         q[m]       = vj;
2107         q[vj]      = qm;
2108         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2109       }
2110     }
2111 
2112     /* modify nonzero structure of k-th row by computing fill-in
2113        for each row prow to be merged in */
2114     prow = k;
2115     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2116 
2117     while (prow < k){
2118       /* merge row prow into k-th row */
2119       jmin = iu[prow] + 1;
2120       jmax = iu[prow+1];
2121       qm = k;
2122       for (j=jmin; j<jmax; j++){
2123         incrlev = lev[j-shift] + 1;
2124 	if (incrlev > levels) continue;
2125         vj      = ju[j];
2126         do {
2127           m = qm; qm = q[m];
2128         } while (qm < vj);
2129         if (qm != vj){      /* a fill */
2130           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2131           levtmp[vj] = incrlev;
2132         } else {
2133           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2134         }
2135       }
2136       prow = prowl[prow]; /* next pivot row */
2137     }
2138 
2139     /* add k to row list for first nonzero element in k-th row */
2140     if (nzk > 1){
2141       i = q[k]; /* col value of first nonzero element in k_th row of U */
2142       prowl[k] = prowl[i]; prowl[i] = k;
2143     }
2144     iu[k+1] = iu[k] + nzk;
2145 
2146     /* allocate more space to ju and lev if needed */
2147     if (iu[k+1] > umax) {
2148       /* estimate how much additional space we will need */
2149       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2150       /* just double the memory each time */
2151       maxadd = umax;
2152       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2153       umax += maxadd;
2154 
2155       /* allocate a longer ju */
2156       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2157       ierr     = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2158       ierr     = PetscFree(ju);CHKERRQ(ierr);
2159       ju       = jutmp;
2160 
2161       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2162       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2163       ierr     = PetscFree(lev);CHKERRQ(ierr);
2164       lev      = jutmp;
2165       reallocs += 2; /* count how many times we realloc */
2166     }
2167 
2168     /* save nonzero structure of k-th row in ju */
2169     i=k;
2170     while (nzk --) {
2171       i                = q[i];
2172       ju[juidx]        = i;
2173       lev[juidx-shift] = levtmp[i];
2174       juidx++;
2175     }
2176   }
2177 
2178 #if defined(PETSC_USE_INFO)
2179   if (ai[mbs] != 0) {
2180     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2181     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
2182     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2183     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
2184     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2185   } else {
2186     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2187   }
2188 #endif
2189 
2190   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2191   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2192   ierr = PetscFree(lev);CHKERRQ(ierr);
2193 
2194   /* put together the new matrix */
2195   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
2196 
2197   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
2198   b    = (Mat_SeqSBAIJ*)(B)->data;
2199   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2200   b->singlemalloc = PETSC_FALSE;
2201   b->free_a       = PETSC_TRUE;
2202   b->free_ij      = PETSC_TRUE;
2203   /* the next line frees the default space generated by the Create() */
2204   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2205   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
2206   b->j    = ju;
2207   b->i    = iu;
2208   b->diag = 0;
2209   b->ilen = 0;
2210   b->imax = 0;
2211 
2212   if (b->row) {
2213     ierr = ISDestroy(b->row);CHKERRQ(ierr);
2214   }
2215   if (b->icol) {
2216     ierr = ISDestroy(b->icol);CHKERRQ(ierr);
2217   }
2218   b->row  = perm;
2219   b->icol = perm;
2220   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2221   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2222   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2223   /* In b structure:  Free imax, ilen, old a, old j.
2224      Allocate idnew, solve_work, new a, new j */
2225   ierr    = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2226   b->maxnz = b->nz = iu[mbs];
2227 
2228   (B)->info.factor_mallocs    = reallocs;
2229   (B)->info.fill_ratio_given  = f;
2230   if (ai[mbs] != 0) {
2231     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2232   } else {
2233     (B)->info.fill_ratio_needed = 0.0;
2234   }
2235   ierr = MatSeqSBAIJSetNumericFactorization(B,perm_identity);CHKERRQ(ierr);
2236   PetscFunctionReturn(0);
2237 }
2238 
2239 #include "petscbt.h"
2240 #include "../src/mat/utils/freespace.h"
2241 #undef __FUNCT__
2242 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2243 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2244 {
2245   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2246   Mat_SeqSBAIJ       *b;
2247   PetscErrorCode     ierr;
2248   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2249   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2250   const PetscInt     *cols,*rip,*ai,*aj;
2251   PetscInt           reallocs=0,i,*ui;
2252   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2253   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2254   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2255   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2256   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2257   PetscBT            lnkbt;
2258 
2259   PetscFunctionBegin;
2260   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2261   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2262 
2263   /*
2264    This code originally uses Modified Sparse Row (MSR) storage
2265    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2266    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2267    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2268    thus the original code in MSR format is still used for these cases.
2269    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2270    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2271   */
2272   if (bs > 1){
2273     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2274     PetscFunctionReturn(0);
2275   }
2276 
2277   /* check whether perm is the identity mapping */
2278   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2279 
2280   /* special case that simply copies fill pattern */
2281   if (!levels && perm_identity) {
2282     a->permute = PETSC_FALSE;
2283     /* reuse the column pointers and row offsets for memory savings */
2284     ui           = a->i;
2285     uj           = a->j;
2286     free_ij      = PETSC_FALSE;
2287     ratio_needed = 1.0;
2288   } else { /* case: levels>0 || (levels=0 && !perm_identity) */
2289     if (perm_identity){
2290       a->permute = PETSC_FALSE;
2291       ai = a->i; aj = a->j;
2292     } else { /*  non-trivial permutation */
2293       a->permute = PETSC_TRUE;
2294       ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2295       ai   = a->inew; aj = a->jnew;
2296     }
2297     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2298 
2299     /* initialization */
2300     ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2301     ui[0] = 0;
2302 
2303     /* jl: linked list for storing indices of the pivot rows
2304        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2305     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2306     for (i=0; i<am; i++){
2307       jl[i] = am; il[i] = 0;
2308     }
2309 
2310     /* create and initialize a linked list for storing column indices of the active row k */
2311     nlnk = am + 1;
2312     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2313 
2314     /* initial FreeSpace size is fill*(ai[am]+1) */
2315     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2316     current_space = free_space;
2317     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2318     current_space_lvl = free_space_lvl;
2319 
2320     for (k=0; k<am; k++){  /* for each active row k */
2321       /* initialize lnk by the column indices of row rip[k] */
2322       nzk   = 0;
2323       ncols = ai[rip[k]+1] - ai[rip[k]];
2324       cols  = aj+ai[rip[k]];
2325       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2326       nzk += nlnk;
2327 
2328       /* update lnk by computing fill-in for each pivot row to be merged in */
2329       prow = jl[k]; /* 1st pivot row */
2330 
2331       while (prow < k){
2332         nextprow = jl[prow];
2333 
2334         /* merge prow into k-th row */
2335         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2336         jmax = ui[prow+1];
2337         ncols = jmax-jmin;
2338         i     = jmin - ui[prow];
2339         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2340         j     = *(uj_lvl_ptr[prow] + i - 1);
2341         cols_lvl = uj_lvl_ptr[prow]+i;
2342         ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2343         nzk += nlnk;
2344 
2345         /* update il and jl for prow */
2346         if (jmin < jmax){
2347           il[prow] = jmin;
2348           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2349         }
2350         prow = nextprow;
2351       }
2352 
2353       /* if free space is not available, make more free space */
2354       if (current_space->local_remaining<nzk) {
2355         i = am - k + 1; /* num of unfactored rows */
2356         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2357         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2358         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2359         reallocs++;
2360       }
2361 
2362       /* copy data into free_space and free_space_lvl, then initialize lnk */
2363       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2364 
2365       /* add the k-th row into il and jl */
2366       if (nzk-1 > 0){
2367         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2368         jl[k] = jl[i]; jl[i] = k;
2369         il[k] = ui[k] + 1;
2370       }
2371       uj_ptr[k]     = current_space->array;
2372       uj_lvl_ptr[k] = current_space_lvl->array;
2373 
2374       current_space->array           += nzk;
2375       current_space->local_used      += nzk;
2376       current_space->local_remaining -= nzk;
2377       current_space_lvl->array           += nzk;
2378       current_space_lvl->local_used      += nzk;
2379       current_space_lvl->local_remaining -= nzk;
2380 
2381       ui[k+1] = ui[k] + nzk;
2382     }
2383 
2384 #if defined(PETSC_USE_INFO)
2385     if (ai[am] != 0) {
2386       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2387       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2388       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2389       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2390     } else {
2391       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2392     }
2393 #endif
2394 
2395     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2396     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2397 
2398     /* destroy list of free space and other temporary array(s) */
2399     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2400     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2401     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2402     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2403     if (ai[am] != 0) {
2404       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2405     } else {
2406       ratio_needed = 0.0;
2407     }
2408   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2409 
2410   /* put together the new matrix in MATSEQSBAIJ format */
2411   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2412 
2413   b = (Mat_SeqSBAIJ*)(fact)->data;
2414   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2415   b->singlemalloc = PETSC_FALSE;
2416   b->free_a  = PETSC_TRUE;
2417   b->free_ij = free_ij;
2418   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2419   b->j    = uj;
2420   b->i    = ui;
2421   b->diag    = 0;
2422   b->ilen    = 0;
2423   b->imax    = 0;
2424   b->row     = perm;
2425   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2426   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2427   b->icol = perm;
2428   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2429   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2430   b->maxnz = b->nz = ui[am];
2431 
2432   (fact)->info.factor_mallocs    = reallocs;
2433   (fact)->info.fill_ratio_given  = fill;
2434   (fact)->info.fill_ratio_needed = ratio_needed;
2435   ierr = MatSeqSBAIJSetNumericFactorization(fact,perm_identity);CHKERRQ(ierr);
2436   PetscFunctionReturn(0);
2437 }
2438 
2439