xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 6abefa4f446b2ea91915da4e25ea0defa83db48e)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for SBAIJ format.
5 */
6 
7 #include "../src/mat/impls/sbaij/seq/sbaij.h"
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
13 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
16   IS              isrow=a->row;
17   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
18   PetscErrorCode  ierr;
19   const PetscInt  *r;
20   PetscInt        nz,*vj,k,idx,k1;
21   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
22   MatScalar       *aa=a->a,*v,*diag;
23   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;
24 
25   PetscFunctionBegin;
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28   t  = a->solve_work;
29   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
31 
32   /* solve U^T * D * y = b by forward substitution */
33   xk = t;
34   for (k=0; k<mbs; k++) { /* t <- perm(b) */
35     idx   = bs*r[k];
36     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37   }
38   for (k=0; k<mbs; k++){
39     v  = aa + bs2*ai[k];
40     xk = t + k*bs;      /* Dk*xk = k-th block of x */
41     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
42     nz = ai[k+1] - ai[k];
43     vj = aj + ai[k];
44     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
45     while (nz--) {
46       /* x(:) += U(k,:)^T*(Dk*xk) */
47       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48       vj++; xj = t + (*vj)*bs;
49       v += bs2;
50     }
51     /* xk = inv(Dk)*(Dk*xk) */
52     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
53     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54   }
55 
56   /* solve U*x = y by back substitution */
57   for (k=mbs-1; k>=0; k--){
58     v  = aa + bs2*ai[k];
59     xk = t + k*bs;        /* xk */
60     nz = ai[k+1] - ai[k];
61     vj = aj + ai[k];
62     xj = t + (*vj)*bs;
63     while (nz--) {
64       /* xk += U(k,:)*x(:) */
65       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66       vj++;
67       v += bs2; xj = t + (*vj)*bs;
68     }
69     idx   = bs*r[k];
70     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71   }
72 
73   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
74   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
75   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
76   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
77   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
78   PetscFunctionReturn(0);
79 }
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
83 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84 {
85   PetscFunctionBegin;
86   SETERRQ(1,"not implemented yet");
87   PetscFunctionReturn(0);
88 }
89 
90 #undef __FUNCT__
91 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
92 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93 {
94   PetscFunctionBegin;
95   SETERRQ(1,"not implemented yet");
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
101 PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102 {
103   PetscErrorCode ierr;
104   PetscInt       nz,*vj,k;
105   PetscInt       bs2 = bs*bs;
106   MatScalar      *v,*diag;
107   PetscScalar    *xk,*xj,*xk_tmp;
108 
109   PetscFunctionBegin;
110   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
111   for (k=0; k<mbs; k++){
112     v  = aa + bs2*ai[k];
113     xk = x + k*bs;      /* Dk*xk = k-th block of x */
114     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
115     nz = ai[k+1] - ai[k];
116     vj = aj + ai[k];
117     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
118     while (nz--) {
119       /* x(:) += U(k,:)^T*(Dk*xk) */
120       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121       vj++; xj = x + (*vj)*bs;
122       v += bs2;
123     }
124     /* xk = inv(Dk)*(Dk*xk) */
125     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127   }
128   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
129   PetscFunctionReturn(0);
130 }
131 
132 #undef __FUNCT__
133 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
134 PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135 {
136   PetscInt       nz,*vj,k;
137   PetscInt       bs2 = bs*bs;
138   MatScalar      *v;
139   PetscScalar    *xk,*xj;
140 
141   PetscFunctionBegin;
142   for (k=mbs-1; k>=0; k--){
143     v  = aa + bs2*ai[k];
144     xk = x + k*bs;        /* xk */
145     nz = ai[k+1] - ai[k];
146     vj = aj + ai[k];
147     xj = x + (*vj)*bs;
148     while (nz--) {
149       /* xk += U(k,:)*x(:) */
150       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151       vj++;
152       v += bs2; xj = x + (*vj)*bs;
153     }
154   }
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNCT__
159 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
160 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161 {
162   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
163   PetscErrorCode ierr;
164   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165   PetscInt       bs=A->rmap->bs;
166   MatScalar      *aa=a->a;
167   PetscScalar    *x,*b;
168 
169   PetscFunctionBegin;
170   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172 
173   /* solve U^T * D * y = b by forward substitution */
174   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
175   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
176 
177   /* solve U*x = y by back substitution */
178   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
179 
180   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
182   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 #undef __FUNCT__
187 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
188 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
189 {
190   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
191   PetscErrorCode ierr;
192   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
193   PetscInt       bs=A->rmap->bs;
194   MatScalar      *aa=a->a;
195   PetscScalar    *x,*b;
196 
197   PetscFunctionBegin;
198   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
199   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
200   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
201   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
202   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
203   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
204   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
210 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
211 {
212   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
213   PetscErrorCode ierr;
214   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
215   PetscInt       bs=A->rmap->bs;
216   MatScalar      *aa=a->a;
217   PetscScalar    *x,*b;
218 
219   PetscFunctionBegin;
220   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
221   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
222   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
223   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
224   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
225   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
226   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 #undef __FUNCT__
231 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
232 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
233 {
234   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
235   IS             isrow=a->row;
236   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237   PetscErrorCode ierr;
238   const PetscInt *r;
239   PetscInt       nz,*vj,k,idx;
240   MatScalar      *aa=a->a,*v,*d;
241   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
242 
243   PetscFunctionBegin;
244   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
245   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
246   t  = a->solve_work;
247   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
248 
249   /* solve U^T * D * y = b by forward substitution */
250   tp = t;
251   for (k=0; k<mbs; k++) { /* t <- perm(b) */
252     idx   = 7*r[k];
253     tp[0] = b[idx];
254     tp[1] = b[idx+1];
255     tp[2] = b[idx+2];
256     tp[3] = b[idx+3];
257     tp[4] = b[idx+4];
258     tp[5] = b[idx+5];
259     tp[6] = b[idx+6];
260     tp += 7;
261   }
262 
263   for (k=0; k<mbs; k++){
264     v  = aa + 49*ai[k];
265     vj = aj + ai[k];
266     tp = t + k*7;
267     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
268     nz = ai[k+1] - ai[k];
269     tp = t + (*vj)*7;
270     while (nz--) {
271       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278       vj++; tp = t + (*vj)*7;
279       v += 49;
280     }
281 
282     /* xk = inv(Dk)*(Dk*xk) */
283     d  = aa+k*49;          /* ptr to inv(Dk) */
284     tp    = t + k*7;
285     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292   }
293 
294   /* solve U*x = y by back substitution */
295   for (k=mbs-1; k>=0; k--){
296     v  = aa + 49*ai[k];
297     vj = aj + ai[k];
298     tp    = t + k*7;
299     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
300     nz = ai[k+1] - ai[k];
301 
302     tp = t + (*vj)*7;
303     while (nz--) {
304       /* xk += U(k,:)*x(:) */
305       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312       vj++; tp = t + (*vj)*7;
313       v += 49;
314     }
315     tp    = t + k*7;
316     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
317     idx   = 7*r[k];
318     x[idx]     = x0;
319     x[idx+1]   = x1;
320     x[idx+2]   = x2;
321     x[idx+3]   = x3;
322     x[idx+4]   = x4;
323     x[idx+5]   = x5;
324     x[idx+6]   = x6;
325   }
326 
327   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
328   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
329   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
330   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
336 PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
337 {
338   MatScalar      *v,*d;
339   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
340   PetscInt       nz,*vj,k;
341 
342   PetscFunctionBegin;
343   for (k=0; k<mbs; k++){
344     v  = aa + 49*ai[k];
345     xp = x + k*7;
346     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
347     nz = ai[k+1] - ai[k];
348     vj = aj + ai[k];
349     xp = x + (*vj)*7;
350     while (nz--) {
351       /* x(:) += U(k,:)^T*(Dk*xk) */
352       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
353       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
354       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
355       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
356       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
357       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
358       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
359       vj++; xp = x + (*vj)*7;
360       v += 49;
361     }
362     /* xk = inv(Dk)*(Dk*xk) */
363     d  = aa+k*49;          /* ptr to inv(Dk) */
364     xp = x + k*7;
365     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
366     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
367     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
368     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
369     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
370     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
371     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
372   }
373   PetscFunctionReturn(0);
374 }
375 
376 #undef __FUNCT__
377 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
378 PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
379 {
380   MatScalar      *v;
381   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
382   PetscInt       nz,*vj,k;
383 
384   PetscFunctionBegin;
385   for (k=mbs-1; k>=0; k--){
386     v  = aa + 49*ai[k];
387     xp = x + k*7;
388     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
389     nz = ai[k+1] - ai[k];
390     vj = aj + ai[k];
391     xp = x + (*vj)*7;
392     while (nz--) {
393       /* xk += U(k,:)*x(:) */
394       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
395       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
396       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
397       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
398       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
399       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
400       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
401       vj++;
402       v += 49; xp = x + (*vj)*7;
403     }
404     xp = x + k*7;
405     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
406   }
407   PetscFunctionReturn(0);
408 }
409 
410 #undef __FUNCT__
411 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
412 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
413 {
414   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
415   PetscErrorCode ierr;
416   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
417   MatScalar      *aa=a->a;
418   PetscScalar    *x,*b;
419 
420   PetscFunctionBegin;
421   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
422   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
423 
424   /* solve U^T * D * y = b by forward substitution */
425   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
426   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
427 
428   /* solve U*x = y by back substitution */
429   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
430 
431   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
432   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
433   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
434   PetscFunctionReturn(0);
435 }
436 
437 #undef __FUNCT__
438 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
439 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
440 {
441   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
442   PetscErrorCode ierr;
443   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
444   MatScalar      *aa=a->a;
445   PetscScalar    *x,*b;
446 
447   PetscFunctionBegin;
448   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
449   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
450   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
451   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
452   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
453   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
454   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
455   PetscFunctionReturn(0);
456 }
457 
458 #undef __FUNCT__
459 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
460 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
461 {
462   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
463   PetscErrorCode ierr;
464   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
465   MatScalar      *aa=a->a;
466   PetscScalar    *x,*b;
467 
468   PetscFunctionBegin;
469   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
470   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
471   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
472   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
473   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
474   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
475   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
476   PetscFunctionReturn(0);
477 }
478 
479 #undef __FUNCT__
480 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
481 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
482 {
483   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
484   IS             isrow=a->row;
485   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
486   PetscErrorCode ierr;
487   const PetscInt *r;
488   PetscInt       nz,*vj,k,idx;
489   MatScalar      *aa=a->a,*v,*d;
490   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
491 
492   PetscFunctionBegin;
493   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
494   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
495   t  = a->solve_work;
496   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
497 
498   /* solve U^T * D * y = b by forward substitution */
499   tp = t;
500   for (k=0; k<mbs; k++) { /* t <- perm(b) */
501     idx   = 6*r[k];
502     tp[0] = b[idx];
503     tp[1] = b[idx+1];
504     tp[2] = b[idx+2];
505     tp[3] = b[idx+3];
506     tp[4] = b[idx+4];
507     tp[5] = b[idx+5];
508     tp += 6;
509   }
510 
511   for (k=0; k<mbs; k++){
512     v  = aa + 36*ai[k];
513     vj = aj + ai[k];
514     tp = t + k*6;
515     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
516     nz = ai[k+1] - ai[k];
517     tp = t + (*vj)*6;
518     while (nz--) {
519       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
520       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
521       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
522       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
523       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
524       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
525       vj++; tp = t + (*vj)*6;
526       v += 36;
527     }
528 
529     /* xk = inv(Dk)*(Dk*xk) */
530     d  = aa+k*36;          /* ptr to inv(Dk) */
531     tp    = t + k*6;
532     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
533     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
534     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
535     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
536     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
537     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
538   }
539 
540   /* solve U*x = y by back substitution */
541   for (k=mbs-1; k>=0; k--){
542     v  = aa + 36*ai[k];
543     vj = aj + ai[k];
544     tp    = t + k*6;
545     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
546     nz = ai[k+1] - ai[k];
547 
548     tp = t + (*vj)*6;
549     while (nz--) {
550       /* xk += U(k,:)*x(:) */
551       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
552       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
553       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
554       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
555       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
556       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
557       vj++; tp = t + (*vj)*6;
558       v += 36;
559     }
560     tp    = t + k*6;
561     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
562     idx   = 6*r[k];
563     x[idx]     = x0;
564     x[idx+1]   = x1;
565     x[idx+2]   = x2;
566     x[idx+3]   = x3;
567     x[idx+4]   = x4;
568     x[idx+5]   = x5;
569   }
570 
571   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
572   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
573   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
574   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
575   PetscFunctionReturn(0);
576 }
577 
578 #undef __FUNCT__
579 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
580 PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
581 {
582   MatScalar      *v,*d;
583   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
584   PetscInt       nz,*vj,k;
585 
586   PetscFunctionBegin;
587   for (k=0; k<mbs; k++){
588     v  = aa + 36*ai[k];
589     xp = x + k*6;
590     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
591     nz = ai[k+1] - ai[k];
592     vj = aj + ai[k];
593     xp = x + (*vj)*6;
594     while (nz--) {
595       /* x(:) += U(k,:)^T*(Dk*xk) */
596       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
597       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
598       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
599       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
600       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
601       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
602       vj++; xp = x + (*vj)*6;
603       v += 36;
604     }
605     /* xk = inv(Dk)*(Dk*xk) */
606     d  = aa+k*36;          /* ptr to inv(Dk) */
607     xp = x + k*6;
608     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
609     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
610     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
611     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
612     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
613     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
614   }
615   PetscFunctionReturn(0);
616 }
617 #undef __FUNCT__
618 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
619 PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
620 {
621   MatScalar      *v;
622   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
623   PetscInt       nz,*vj,k;
624 
625   PetscFunctionBegin;
626   for (k=mbs-1; k>=0; k--){
627     v  = aa + 36*ai[k];
628     xp = x + k*6;
629     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
630     nz = ai[k+1] - ai[k];
631     vj = aj + ai[k];
632     xp = x + (*vj)*6;
633     while (nz--) {
634       /* xk += U(k,:)*x(:) */
635       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
636       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
637       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
638       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
639       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
640       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
641       vj++;
642       v += 36; xp = x + (*vj)*6;
643     }
644     xp = x + k*6;
645     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
646   }
647   PetscFunctionReturn(0);
648 }
649 
650 
651 #undef __FUNCT__
652 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
653 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
654 {
655   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
656   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
657   MatScalar      *aa=a->a;
658   PetscScalar    *x,*b;
659   PetscErrorCode ierr;
660 
661   PetscFunctionBegin;
662   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
663   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
664 
665   /* solve U^T * D * y = b by forward substitution */
666   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
667   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
668 
669   /* solve U*x = y by back substitution */
670   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
671 
672   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
673   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
674   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
675   PetscFunctionReturn(0);
676 }
677 
678 #undef __FUNCT__
679 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
680 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
681 {
682   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
683   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
684   MatScalar      *aa=a->a;
685   PetscScalar    *x,*b;
686   PetscErrorCode ierr;
687 
688   PetscFunctionBegin;
689   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
690   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
691   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
692   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
693   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
694   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
695   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
696   PetscFunctionReturn(0);
697 }
698 
699 #undef __FUNCT__
700 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
701 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
702 {
703   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
704   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
705   MatScalar      *aa=a->a;
706   PetscScalar    *x,*b;
707   PetscErrorCode ierr;
708 
709   PetscFunctionBegin;
710   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
711   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
712   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
713   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
714   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
715   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
716   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
717   PetscFunctionReturn(0);
718 }
719 
720 #undef __FUNCT__
721 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
722 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
723 {
724   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
725   IS             isrow=a->row;
726   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
727   PetscErrorCode ierr;
728   const PetscInt *r;
729   PetscInt       nz,*vj,k,idx;
730   MatScalar      *aa=a->a,*v,*diag;
731   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;
732 
733   PetscFunctionBegin;
734   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
735   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
736   t  = a->solve_work;
737   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
738 
739   /* solve U^T * D * y = b by forward substitution */
740   tp = t;
741   for (k=0; k<mbs; k++) { /* t <- perm(b) */
742     idx   = 5*r[k];
743     tp[0] = b[idx];
744     tp[1] = b[idx+1];
745     tp[2] = b[idx+2];
746     tp[3] = b[idx+3];
747     tp[4] = b[idx+4];
748     tp += 5;
749   }
750 
751   for (k=0; k<mbs; k++){
752     v  = aa + 25*ai[k];
753     vj = aj + ai[k];
754     tp = t + k*5;
755     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
756     nz = ai[k+1] - ai[k];
757 
758     tp = t + (*vj)*5;
759     while (nz--) {
760       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
761       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
762       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
763       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
764       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
765       vj++; tp = t + (*vj)*5;
766       v += 25;
767     }
768 
769     /* xk = inv(Dk)*(Dk*xk) */
770     diag  = aa+k*25;          /* ptr to inv(Dk) */
771     tp    = t + k*5;
772       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
773       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
774       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
775       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
776       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
777   }
778 
779   /* solve U*x = y by back substitution */
780   for (k=mbs-1; k>=0; k--){
781     v  = aa + 25*ai[k];
782     vj = aj + ai[k];
783     tp    = t + k*5;
784     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
785     nz = ai[k+1] - ai[k];
786 
787     tp = t + (*vj)*5;
788     while (nz--) {
789       /* xk += U(k,:)*x(:) */
790       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
791       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
792       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
793       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
794       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
795       vj++; tp = t + (*vj)*5;
796       v += 25;
797     }
798     tp    = t + k*5;
799     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
800     idx   = 5*r[k];
801     x[idx]     = x0;
802     x[idx+1]   = x1;
803     x[idx+2]   = x2;
804     x[idx+3]   = x3;
805     x[idx+4]   = x4;
806   }
807 
808   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
809   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
811   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
812   PetscFunctionReturn(0);
813 }
814 
815 #undef __FUNCT__
816 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
817 PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
818 {
819   MatScalar      *v,*diag;
820   PetscScalar    *xp,x0,x1,x2,x3,x4;
821   PetscInt       nz,*vj,k;
822 
823   PetscFunctionBegin;
824   for (k=0; k<mbs; k++){
825     v  = aa + 25*ai[k];
826     xp = x + k*5;
827     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
828     nz = ai[k+1] - ai[k];
829     vj = aj + ai[k];
830     xp = x + (*vj)*5;
831     while (nz--) {
832       /* x(:) += U(k,:)^T*(Dk*xk) */
833       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
834       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
835       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
836       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
837       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
838       vj++; xp = x + (*vj)*5;
839       v += 25;
840     }
841     /* xk = inv(Dk)*(Dk*xk) */
842     diag = aa+k*25;          /* ptr to inv(Dk) */
843     xp   = x + k*5;
844     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
845     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
846     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
847     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
848     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
849   }
850   PetscFunctionReturn(0);
851 }
852 
853 #undef __FUNCT__
854 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
855 PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
856 {
857   MatScalar      *v;
858   PetscScalar    *xp,x0,x1,x2,x3,x4;
859   PetscInt       nz,*vj,k;
860 
861   PetscFunctionBegin;
862   for (k=mbs-1; k>=0; k--){
863     v  = aa + 25*ai[k];
864     xp = x + k*5;
865     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
866     nz = ai[k+1] - ai[k];
867     vj = aj + ai[k];
868     xp = x + (*vj)*5;
869     while (nz--) {
870       /* xk += U(k,:)*x(:) */
871       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
872       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
873       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
874       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
875       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
876       vj++;
877       v += 25; xp = x + (*vj)*5;
878     }
879     xp = x + k*5;
880     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
881   }
882   PetscFunctionReturn(0);
883 }
884 
885 #undef __FUNCT__
886 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
887 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
888 {
889   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
890   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
891   MatScalar      *aa=a->a;
892   PetscScalar    *x,*b;
893   PetscErrorCode ierr;
894 
895   PetscFunctionBegin;
896   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
897   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
898 
899   /* solve U^T * D * y = b by forward substitution */
900   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
901   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
902 
903   /* solve U*x = y by back substitution */
904   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
905 
906   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
907   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
908   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
909   PetscFunctionReturn(0);
910 }
911 
912 #undef __FUNCT__
913 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
914 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
915 {
916   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
917   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
918   MatScalar      *aa=a->a;
919   PetscScalar    *x,*b;
920   PetscErrorCode ierr;
921 
922   PetscFunctionBegin;
923   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
924   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
925   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
926   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
927   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
928   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
929   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
930   PetscFunctionReturn(0);
931 }
932 
933 #undef __FUNCT__
934 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
935 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
936 {
937   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
938   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
939   MatScalar      *aa=a->a;
940   PetscScalar    *x,*b;
941   PetscErrorCode ierr;
942 
943   PetscFunctionBegin;
944   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
945   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
946   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
947   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
948   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
949   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
950   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
951   PetscFunctionReturn(0);
952 }
953 
954 #undef __FUNCT__
955 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
956 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
957 {
958   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
959   IS             isrow=a->row;
960   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
961   PetscErrorCode ierr;
962   const PetscInt *r;
963   PetscInt       nz,*vj,k,idx;
964   MatScalar      *aa=a->a,*v,*diag;
965   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;
966 
967   PetscFunctionBegin;
968   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
969   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
970   t  = a->solve_work;
971   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
972 
973   /* solve U^T * D * y = b by forward substitution */
974   tp = t;
975   for (k=0; k<mbs; k++) { /* t <- perm(b) */
976     idx   = 4*r[k];
977     tp[0] = b[idx];
978     tp[1] = b[idx+1];
979     tp[2] = b[idx+2];
980     tp[3] = b[idx+3];
981     tp += 4;
982   }
983 
984   for (k=0; k<mbs; k++){
985     v  = aa + 16*ai[k];
986     vj = aj + ai[k];
987     tp = t + k*4;
988     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
989     nz = ai[k+1] - ai[k];
990 
991     tp = t + (*vj)*4;
992     while (nz--) {
993       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
994       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
995       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
996       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
997       vj++; tp = t + (*vj)*4;
998       v += 16;
999     }
1000 
1001     /* xk = inv(Dk)*(Dk*xk) */
1002     diag  = aa+k*16;          /* ptr to inv(Dk) */
1003     tp    = t + k*4;
1004     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1005     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1006     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1007     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1008   }
1009 
1010   /* solve U*x = y by back substitution */
1011   for (k=mbs-1; k>=0; k--){
1012     v  = aa + 16*ai[k];
1013     vj = aj + ai[k];
1014     tp    = t + k*4;
1015     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1016     nz = ai[k+1] - ai[k];
1017 
1018     tp = t + (*vj)*4;
1019     while (nz--) {
1020       /* xk += U(k,:)*x(:) */
1021       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1022       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1023       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1024       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1025       vj++; tp = t + (*vj)*4;
1026       v += 16;
1027     }
1028     tp    = t + k*4;
1029     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1030     idx        = 4*r[k];
1031     x[idx]     = x0;
1032     x[idx+1]   = x1;
1033     x[idx+2]   = x2;
1034     x[idx+3]   = x3;
1035   }
1036 
1037   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1038   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1039   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1040   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1041    PetscFunctionReturn(0);
1042 }
1043 
1044 #undef __FUNCT__
1045 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1046 PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1047 {
1048   MatScalar      *v,*diag;
1049   PetscScalar    *xp,x0,x1,x2,x3;
1050   PetscInt       nz,*vj,k;
1051 
1052   PetscFunctionBegin;
1053   for (k=0; k<mbs; k++){
1054     v  = aa + 16*ai[k];
1055     xp = x + k*4;
1056     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1057     nz = ai[k+1] - ai[k];
1058     vj = aj + ai[k];
1059     xp = x + (*vj)*4;
1060     while (nz--) {
1061       /* x(:) += U(k,:)^T*(Dk*xk) */
1062       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1063       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1064       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1065       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1066       vj++; xp = x + (*vj)*4;
1067       v += 16;
1068     }
1069     /* xk = inv(Dk)*(Dk*xk) */
1070     diag = aa+k*16;          /* ptr to inv(Dk) */
1071     xp   = x + k*4;
1072     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1073     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1074     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1075     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1076   }
1077   PetscFunctionReturn(0);
1078 }
1079 
1080 #undef __FUNCT__
1081 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1082 PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1083 {
1084   MatScalar      *v;
1085   PetscScalar    *xp,x0,x1,x2,x3;
1086   PetscInt       nz,*vj,k;
1087 
1088   PetscFunctionBegin;
1089   for (k=mbs-1; k>=0; k--){
1090     v  = aa + 16*ai[k];
1091     xp = x + k*4;
1092     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1093     nz = ai[k+1] - ai[k];
1094     vj = aj + ai[k];
1095     xp = x + (*vj)*4;
1096     while (nz--) {
1097       /* xk += U(k,:)*x(:) */
1098       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1099       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1100       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1101       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1102       vj++;
1103       v += 16; xp = x + (*vj)*4;
1104     }
1105     xp = x + k*4;
1106     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1107   }
1108   PetscFunctionReturn(0);
1109 }
1110 
1111 #undef __FUNCT__
1112 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1113 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1114 {
1115   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1116   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1117   MatScalar      *aa=a->a;
1118   PetscScalar    *x,*b;
1119   PetscErrorCode ierr;
1120 
1121   PetscFunctionBegin;
1122   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1123   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1124 
1125   /* solve U^T * D * y = b by forward substitution */
1126   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1127   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1128 
1129   /* solve U*x = y by back substitution */
1130   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1131   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1133   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1134   PetscFunctionReturn(0);
1135 }
1136 
1137 #undef __FUNCT__
1138 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1139 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1140 {
1141   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1142   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1143   MatScalar      *aa=a->a;
1144   PetscScalar    *x,*b;
1145   PetscErrorCode ierr;
1146 
1147   PetscFunctionBegin;
1148   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1149   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1150   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1151   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1152   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1153   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1154   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1155   PetscFunctionReturn(0);
1156 }
1157 
1158 #undef __FUNCT__
1159 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1160 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1161 {
1162   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1163   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1164   MatScalar      *aa=a->a;
1165   PetscScalar    *x,*b;
1166   PetscErrorCode ierr;
1167 
1168   PetscFunctionBegin;
1169   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1170   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1171   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1172   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1173   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1174   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1175   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1176   PetscFunctionReturn(0);
1177 }
1178 
1179 #undef __FUNCT__
1180 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1181 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1182 {
1183   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1184   IS             isrow=a->row;
1185   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1186   PetscErrorCode ierr;
1187   const PetscInt *r;
1188   PetscInt       nz,*vj,k,idx;
1189   MatScalar      *aa=a->a,*v,*diag;
1190   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;
1191 
1192   PetscFunctionBegin;
1193   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1194   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1195   t  = a->solve_work;
1196   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1197 
1198   /* solve U^T * D * y = b by forward substitution */
1199   tp = t;
1200   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1201     idx   = 3*r[k];
1202     tp[0] = b[idx];
1203     tp[1] = b[idx+1];
1204     tp[2] = b[idx+2];
1205     tp += 3;
1206   }
1207 
1208   for (k=0; k<mbs; k++){
1209     v  = aa + 9*ai[k];
1210     vj = aj + ai[k];
1211     tp = t + k*3;
1212     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1213     nz = ai[k+1] - ai[k];
1214 
1215     tp = t + (*vj)*3;
1216     while (nz--) {
1217       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1218       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1219       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1220       vj++; tp = t + (*vj)*3;
1221       v += 9;
1222     }
1223 
1224     /* xk = inv(Dk)*(Dk*xk) */
1225     diag  = aa+k*9;          /* ptr to inv(Dk) */
1226     tp    = t + k*3;
1227     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1228     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1229     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1230   }
1231 
1232   /* solve U*x = y by back substitution */
1233   for (k=mbs-1; k>=0; k--){
1234     v  = aa + 9*ai[k];
1235     vj = aj + ai[k];
1236     tp    = t + k*3;
1237     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1238     nz = ai[k+1] - ai[k];
1239 
1240     tp = t + (*vj)*3;
1241     while (nz--) {
1242       /* xk += U(k,:)*x(:) */
1243       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1244       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1245       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1246       vj++; tp = t + (*vj)*3;
1247       v += 9;
1248     }
1249     tp    = t + k*3;
1250     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1251     idx      = 3*r[k];
1252     x[idx]   = x0;
1253     x[idx+1] = x1;
1254     x[idx+2] = x2;
1255   }
1256 
1257   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1258   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1259   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1260   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1261   PetscFunctionReturn(0);
1262 }
1263 
1264 #undef __FUNCT__
1265 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1266 PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1267 {
1268   MatScalar      *v,*diag;
1269   PetscScalar    *xp,x0,x1,x2;
1270   PetscInt       nz,*vj,k;
1271 
1272   PetscFunctionBegin;
1273   for (k=0; k<mbs; k++){
1274     v  = aa + 9*ai[k];
1275     xp = x + k*3;
1276     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1277     nz = ai[k+1] - ai[k];
1278     vj = aj + ai[k];
1279     xp = x + (*vj)*3;
1280     while (nz--) {
1281       /* x(:) += U(k,:)^T*(Dk*xk) */
1282       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1283       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1284       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1285       vj++; xp = x + (*vj)*3;
1286       v += 9;
1287     }
1288     /* xk = inv(Dk)*(Dk*xk) */
1289     diag = aa+k*9;          /* ptr to inv(Dk) */
1290     xp   = x + k*3;
1291     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1292     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1293     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1294   }
1295   PetscFunctionReturn(0);
1296 }
1297 
1298 #undef __FUNCT__
1299 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1300 PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1301 {
1302   MatScalar      *v;
1303   PetscScalar    *xp,x0,x1,x2;
1304   PetscInt       nz,*vj,k;
1305 
1306   PetscFunctionBegin;
1307   for (k=mbs-1; k>=0; k--){
1308     v  = aa + 9*ai[k];
1309     xp = x + k*3;
1310     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1311     nz = ai[k+1] - ai[k];
1312     vj = aj + ai[k];
1313     xp = x + (*vj)*3;
1314     while (nz--) {
1315       /* xk += U(k,:)*x(:) */
1316       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1317       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1318       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1319       vj++;
1320       v += 9; xp = x + (*vj)*3;
1321     }
1322     xp = x + k*3;
1323     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1324   }
1325   PetscFunctionReturn(0);
1326 }
1327 
1328 #undef __FUNCT__
1329 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1330 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1331 {
1332   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1333   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1334   MatScalar      *aa=a->a;
1335   PetscScalar    *x,*b;
1336   PetscErrorCode ierr;
1337 
1338   PetscFunctionBegin;
1339   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1340   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1341 
1342   /* solve U^T * D * y = b by forward substitution */
1343   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1344   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1345 
1346   /* solve U*x = y by back substitution */
1347   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1348 
1349   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1350   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1351   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1352   PetscFunctionReturn(0);
1353 }
1354 
1355 #undef __FUNCT__
1356 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1357 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1358 {
1359   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1360   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1361   MatScalar      *aa=a->a;
1362   PetscScalar    *x,*b;
1363   PetscErrorCode ierr;
1364 
1365   PetscFunctionBegin;
1366   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1367   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1368   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1369   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1370   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1371   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1372   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1373   PetscFunctionReturn(0);
1374 }
1375 
1376 #undef __FUNCT__
1377 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1378 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1379 {
1380   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1381   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1382   MatScalar      *aa=a->a;
1383   PetscScalar    *x,*b;
1384   PetscErrorCode ierr;
1385 
1386   PetscFunctionBegin;
1387   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1388   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1389   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1390   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1391   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1392   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1393   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1394   PetscFunctionReturn(0);
1395 }
1396 
1397 #undef __FUNCT__
1398 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1399 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1400 {
1401   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1402   IS             isrow=a->row;
1403   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1404   PetscErrorCode ierr;
1405   const PetscInt *r;
1406   PetscInt       nz,*vj,k,k2,idx;
1407   MatScalar      *aa=a->a,*v,*diag;
1408   PetscScalar    *x,*b,x0,x1,*t;
1409 
1410   PetscFunctionBegin;
1411   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1412   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1413   t  = a->solve_work;
1414   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1415 
1416   /* solve U^T * D * y = perm(b) by forward substitution */
1417   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1418     idx = 2*r[k];
1419     t[k*2]   = b[idx];
1420     t[k*2+1] = b[idx+1];
1421   }
1422   for (k=0; k<mbs; k++){
1423     v  = aa + 4*ai[k];
1424     vj = aj + ai[k];
1425     k2 = k*2;
1426     x0 = t[k2]; x1 = t[k2+1];
1427     nz = ai[k+1] - ai[k];
1428     while (nz--) {
1429       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1430       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1431       vj++; v += 4;
1432     }
1433     diag = aa+k*4;  /* ptr to inv(Dk) */
1434     t[k2]   = diag[0]*x0 + diag[2]*x1;
1435     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1436   }
1437 
1438   /* solve U*x = y by back substitution */
1439   for (k=mbs-1; k>=0; k--){
1440     v  = aa + 4*ai[k];
1441     vj = aj + ai[k];
1442     k2 = k*2;
1443     x0 = t[k2]; x1 = t[k2+1];
1444     nz = ai[k+1] - ai[k];
1445     while (nz--) {
1446       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1447       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1448       vj++; v += 4;
1449     }
1450     t[k2]    = x0;
1451     t[k2+1]  = x1;
1452     idx      = 2*r[k];
1453     x[idx]   = x0;
1454     x[idx+1] = x1;
1455   }
1456 
1457   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1458   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1459   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1460   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1461   PetscFunctionReturn(0);
1462 }
1463 
1464 #undef __FUNCT__
1465 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1466 PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1467 {
1468   MatScalar      *v,*diag;
1469   PetscScalar    x0,x1;
1470   PetscInt       nz,*vj,k,k2;
1471 
1472   PetscFunctionBegin;
1473   for (k=0; k<mbs; k++){
1474     v  = aa + 4*ai[k];
1475     vj = aj + ai[k];
1476     k2 = k*2;
1477     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1478     nz = ai[k+1] - ai[k];
1479 
1480     while (nz--) {
1481       /* x(:) += U(k,:)^T*(Dk*xk) */
1482       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1483       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1484       vj++; v += 4;
1485     }
1486     /* xk = inv(Dk)*(Dk*xk) */
1487     diag = aa+k*4;          /* ptr to inv(Dk) */
1488     x[k2]   = diag[0]*x0 + diag[2]*x1;
1489     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1490   }
1491   PetscFunctionReturn(0);
1492 }
1493 
1494 #undef __FUNCT__
1495 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1496 PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1497 {
1498   MatScalar      *v;
1499   PetscScalar    x0,x1;
1500   PetscInt       nz,*vj,k,k2;
1501 
1502   PetscFunctionBegin;
1503   for (k=mbs-1; k>=0; k--){
1504     v  = aa + 4*ai[k];
1505     vj = aj + ai[k];
1506     k2 = k*2;
1507     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1508     nz = ai[k+1] - ai[k];
1509     while (nz--) {
1510       /* xk += U(k,:)*x(:) */
1511       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1512       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1513       vj++; v += 4;
1514     }
1515     x[k2]     = x0;
1516     x[k2+1]   = x1;
1517   }
1518   PetscFunctionReturn(0);
1519 }
1520 
1521 #undef __FUNCT__
1522 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1523 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1524 {
1525   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1526   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1527   MatScalar      *aa=a->a;
1528   PetscScalar    *x,*b;
1529   PetscErrorCode ierr;
1530 
1531   PetscFunctionBegin;
1532   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534 
1535   /* solve U^T * D * y = b by forward substitution */
1536   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1537   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1538 
1539   /* solve U*x = y by back substitution */
1540   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1541 
1542   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1543   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1545   PetscFunctionReturn(0);
1546 }
1547 
1548 #undef __FUNCT__
1549 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1550 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1551 {
1552   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1553   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1554   MatScalar      *aa=a->a;
1555   PetscScalar    *x,*b;
1556   PetscErrorCode ierr;
1557 
1558   PetscFunctionBegin;
1559   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1560   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1561   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1562   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1563   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1564   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1565   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1566   PetscFunctionReturn(0);
1567 }
1568 
1569 #undef __FUNCT__
1570 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1571 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1572 {
1573   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1574   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1575   MatScalar      *aa=a->a;
1576   PetscScalar    *x,*b;
1577   PetscErrorCode ierr;
1578 
1579   PetscFunctionBegin;
1580   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1581   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1582   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1583   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1584   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1585   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1586   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
1587   PetscFunctionReturn(0);
1588 }
1589 
1590 #undef __FUNCT__
1591 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1592 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1593 {
1594   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1595   IS                isrow=a->row;
1596   PetscErrorCode    ierr;
1597   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1598   const MatScalar   *aa=a->a,*v;
1599   const PetscScalar *b;
1600   PetscScalar       *x,xk,*t;
1601   PetscInt          nz,k,j;
1602 
1603   PetscFunctionBegin;
1604   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1605   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1606   t    = a->solve_work;
1607   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1608 
1609   /* solve U^T*D*y = perm(b) by forward substitution */
1610   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1611   for (k=0; k<mbs; k++){
1612     v  = aa + ai[k];
1613     vj = aj + ai[k];
1614     xk = t[k];
1615     nz = ai[k+1] - ai[k] - 1;
1616     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1617     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1618   }
1619 
1620   /* solve U*perm(x) = y by back substitution */
1621   for (k=mbs-1; k>=0; k--){
1622     v  = aa + adiag[k] - 1;
1623     vj = aj + adiag[k] - 1;
1624     nz = ai[k+1] - ai[k] - 1;
1625     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1626     x[rp[k]] = t[k];
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1630   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1632   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1633   PetscFunctionReturn(0);
1634 }
1635 
1636 #undef __FUNCT__
1637 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1638 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1639 {
1640   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1641   IS              isrow=a->row;
1642   PetscErrorCode  ierr;
1643   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1644   const MatScalar *aa=a->a,*v;
1645   PetscScalar     *x,*b,xk,*t;
1646   PetscInt        nz,k;
1647 
1648   PetscFunctionBegin;
1649   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1650   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1651   t    = a->solve_work;
1652   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1653 
1654   /* solve U^T*D*y = perm(b) by forward substitution */
1655   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1656   for (k=0; k<mbs; k++){
1657     v  = aa + ai[k] + 1;
1658     vj = aj + ai[k] + 1;
1659     xk = t[k];
1660     nz = ai[k+1] - ai[k] - 1;
1661     while (nz--) t[*vj++] += (*v++) * xk;
1662     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1663   }
1664 
1665   /* solve U*perm(x) = y by back substitution */
1666   for (k=mbs-1; k>=0; k--){
1667     v  = aa + ai[k] + 1;
1668     vj = aj + ai[k] + 1;
1669     nz = ai[k+1] - ai[k] - 1;
1670     while (nz--) t[k] += (*v++) * t[*vj++];
1671     x[rp[k]] = t[k];
1672   }
1673 
1674   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1675   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1676   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1677   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1678   PetscFunctionReturn(0);
1679 }
1680 
1681 #undef __FUNCT__
1682 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1683 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1684 {
1685   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1686   IS              isrow=a->row;
1687   PetscErrorCode  ierr;
1688   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1689   const MatScalar *aa=a->a,*v;
1690   PetscReal       diagk;
1691   PetscScalar     *x,*b,xk;
1692   PetscInt        nz,k;
1693 
1694   PetscFunctionBegin;
1695   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1696   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1697   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1698   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1699 
1700   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1701   for (k=0; k<mbs; k++){
1702     v  = aa + ai[k];
1703     vj = aj + ai[k];
1704     xk = x[k];
1705     nz = ai[k+1] - ai[k] - 1;
1706     while (nz--) x[*vj++] += (*v++) * xk;
1707 
1708     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1709     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1710     x[k] = xk*sqrt(diagk);
1711   }
1712   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1713   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1714   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1715   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1716   PetscFunctionReturn(0);
1717 }
1718 
1719 #undef __FUNCT__
1720 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1721 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1722 {
1723   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1724   IS              isrow=a->row;
1725   PetscErrorCode  ierr;
1726   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1727   const MatScalar *aa=a->a,*v;
1728   PetscReal       diagk;
1729   PetscScalar     *x,*b,xk;
1730   PetscInt        nz,k;
1731 
1732   PetscFunctionBegin;
1733   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1734   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1735   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1736   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1737 
1738   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1739   for (k=0; k<mbs; k++){
1740     v  = aa + ai[k] + 1;
1741     vj = aj + ai[k] + 1;
1742     xk = x[k];
1743     nz = ai[k+1] - ai[k] - 1;
1744     while (nz--) x[*vj++] += (*v++) * xk;
1745 
1746     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1747     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1748     x[k] = xk*sqrt(diagk);
1749   }
1750   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1751   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1752   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1753   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1754   PetscFunctionReturn(0);
1755 }
1756 
1757 #undef __FUNCT__
1758 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1759 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1760 {
1761   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1762   IS              isrow=a->row;
1763   PetscErrorCode  ierr;
1764   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1765   const MatScalar *aa=a->a,*v;
1766   PetscReal       diagk;
1767   PetscScalar     *x,*b,*t;
1768   PetscInt        nz,k;
1769 
1770   PetscFunctionBegin;
1771   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1772   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1773   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1774   t    = a->solve_work;
1775   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1776 
1777   for (k=mbs-1; k>=0; k--){
1778     v  = aa + ai[k];
1779     vj = aj + ai[k];
1780     diagk = PetscRealPart(aa[adiag[k]]);
1781     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1782     t[k] = b[k] * sqrt(diagk);
1783     nz = ai[k+1] - ai[k] - 1;
1784     while (nz--) t[k] += (*v++) * t[*vj++];
1785     x[rp[k]] = t[k];
1786   }
1787   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1788   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1789   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1790   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1791   PetscFunctionReturn(0);
1792 }
1793 
1794 #undef __FUNCT__
1795 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1796 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1797 {
1798   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1799   IS              isrow=a->row;
1800   PetscErrorCode  ierr;
1801   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1802   const MatScalar *aa=a->a,*v;
1803   PetscReal       diagk;
1804   PetscScalar     *x,*b,*t;
1805   PetscInt        nz,k;
1806 
1807   PetscFunctionBegin;
1808   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1809   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1810   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1811   t    = a->solve_work;
1812   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1813 
1814   for (k=mbs-1; k>=0; k--){
1815     v  = aa + ai[k] + 1;
1816     vj = aj + ai[k] + 1;
1817     diagk = PetscRealPart(aa[ai[k]]);
1818     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1819     t[k] = b[k] * sqrt(diagk);
1820     nz = ai[k+1] - ai[k] - 1;
1821     while (nz--) t[k] += (*v++) * t[*vj++];
1822     x[rp[k]] = t[k];
1823   }
1824   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1825   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1826   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1827   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1828   PetscFunctionReturn(0);
1829 }
1830 
1831 #undef __FUNCT__
1832 #define __FUNCT__ "MatSolves_SeqSBAIJ_1"
1833 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1834 {
1835   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1836   PetscErrorCode ierr;
1837 
1838   PetscFunctionBegin;
1839   if (A->rmap->bs == 1) {
1840     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1841   } else {
1842     IS              isrow=a->row;
1843     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1844     const MatScalar *aa=a->a,*v;
1845     PetscScalar     *x,*b,*t;
1846     PetscInt        nz,k,n,i,j;
1847     if (bb->n > a->solves_work_n) {
1848       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1849       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1850       a->solves_work_n = bb->n;
1851     }
1852     n    = bb->n;
1853     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1854     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1855     t    = a->solves_work;
1856 
1857     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1858 
1859     /* solve U^T*D*y = perm(b) by forward substitution */
1860     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1861     for (k=0; k<mbs; k++){
1862       v  = aa + ai[k];
1863       vj = aj + ai[k];
1864       nz = ai[k+1] - ai[k] - 1;
1865       for (j=0; j<nz; j++){
1866         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1867         v++;vj++;
1868       }
1869       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1870     }
1871 
1872     /* solve U*perm(x) = y by back substitution */
1873     for (k=mbs-1; k>=0; k--){
1874       v  = aa + ai[k] - 1;
1875       vj = aj + ai[k] - 1;
1876       nz = ai[k+1] - ai[k] - 1;
1877       for (j=0; j<nz; j++){
1878         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1879         v++;vj++;
1880       }
1881       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1882     }
1883 
1884     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1885     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1886     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1887     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1888   }
1889   PetscFunctionReturn(0);
1890 }
1891 
1892 #undef __FUNCT__
1893 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1894 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1895 {
1896   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1897   PetscErrorCode ierr;
1898 
1899   PetscFunctionBegin;
1900   if (A->rmap->bs == 1) {
1901     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1902   } else {
1903     IS              isrow=a->row;
1904     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1905     const MatScalar *aa=a->a,*v;
1906     PetscScalar     *x,*b,*t;
1907     PetscInt        nz,k,n,i;
1908     if (bb->n > a->solves_work_n) {
1909       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1910       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1911       a->solves_work_n = bb->n;
1912     }
1913     n    = bb->n;
1914     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1915     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1916     t    = a->solves_work;
1917 
1918     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1919 
1920     /* solve U^T*D*y = perm(b) by forward substitution */
1921     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1922     for (k=0; k<mbs; k++){
1923       v  = aa + ai[k];
1924       vj = aj + ai[k];
1925       nz = ai[k+1] - ai[k];
1926       while (nz--) {
1927         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1928         v++;vj++;
1929       }
1930       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1931     }
1932 
1933     /* solve U*perm(x) = y by back substitution */
1934     for (k=mbs-1; k>=0; k--){
1935       v  = aa + ai[k];
1936       vj = aj + ai[k];
1937       nz = ai[k+1] - ai[k];
1938       while (nz--) {
1939         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1940         v++;vj++;
1941       }
1942       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1943     }
1944 
1945     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1946     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1947     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1948     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1949   }
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
1955 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1956 {
1957   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1958   PetscErrorCode    ierr;
1959   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1960   const MatScalar   *aa=a->a,*v;
1961   const PetscScalar *b;
1962   PetscScalar       *x,xi;
1963   PetscInt          nz,i,j;
1964 
1965   PetscFunctionBegin;
1966   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1967   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1968 
1969   /* solve U^T*D*y = b by forward substitution */
1970   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1971   for (i=0; i<mbs; i++){
1972     v  = aa + ai[i];
1973     vj = aj + ai[i];
1974     xi = x[i];
1975     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1976     for (j=0; j<nz; j++){
1977       x[vj[j]] += v[j]* xi;
1978     }
1979     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
1980   }
1981 
1982   /* solve U*x = y by backward substitution */
1983   for (i=mbs-2; i>=0; i--){
1984     xi = x[i];
1985     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
1986     vj = aj + adiag[i] - 1;
1987     nz = ai[i+1] - ai[i] - 1;
1988     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
1989     x[i] = xi;
1990   }
1991 
1992   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1993   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1994   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1995   PetscFunctionReturn(0);
1996 }
1997 
1998 #undef __FUNCT__
1999 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2000 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2001 {
2002   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2003   PetscErrorCode ierr;
2004   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2005   MatScalar      *aa=a->a,*v;
2006   PetscScalar    *x,*b,xk;
2007   PetscInt       nz,*vj,k;
2008 
2009   PetscFunctionBegin;
2010   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2011   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2012 
2013   /* solve U^T*D*y = b by forward substitution */
2014   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2015   for (k=0; k<mbs; k++){
2016     v  = aa + ai[k] + 1;
2017     vj = aj + ai[k] + 1;
2018     xk = x[k];
2019     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2020     while (nz--) x[*vj++] += (*v++) * xk;
2021     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2022   }
2023 
2024   /* solve U*x = y by back substitution */
2025   for (k=mbs-2; k>=0; k--){
2026     v  = aa + ai[k] + 1;
2027     vj = aj + ai[k] + 1;
2028     xk = x[k];
2029     nz = ai[k+1] - ai[k] - 1;
2030     while (nz--) {
2031       xk += (*v++) * x[*vj++];
2032     }
2033     x[k] = xk;
2034   }
2035 
2036   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2038   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2039   PetscFunctionReturn(0);
2040 }
2041 
2042 #undef __FUNCT__
2043 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
2044 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2045 {
2046   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
2047   PetscErrorCode  ierr;
2048   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2049   const MatScalar *aa=a->a,*v;
2050   PetscReal       diagk;
2051   PetscScalar     *x,*b;
2052   PetscInt        nz,*vj,k;
2053 
2054   PetscFunctionBegin;
2055   /* solve U^T*D^(1/2)*x = b by forward substitution */
2056   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2057   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2058   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2059   for (k=0; k<mbs; k++){
2060     v  = aa + ai[k];
2061     vj = aj + ai[k];
2062     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2063     while (nz--) x[*vj++] += (*v++) * x[k];
2064     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2065     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2066     x[k] *= sqrt(diagk);
2067   }
2068   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2069   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2070   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2071   PetscFunctionReturn(0);
2072 }
2073 
2074 #undef __FUNCT__
2075 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2076 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2077 {
2078   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2079   PetscErrorCode ierr;
2080   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2081   MatScalar      *aa=a->a,*v;
2082   PetscReal      diagk;
2083   PetscScalar    *x,*b;
2084   PetscInt       nz,*vj,k;
2085 
2086   PetscFunctionBegin;
2087   /* solve U^T*D^(1/2)*x = b by forward substitution */
2088   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2089   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2090   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2091   for (k=0; k<mbs; k++){
2092     v  = aa + ai[k] + 1;
2093     vj = aj + ai[k] + 1;
2094     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2095     while (nz--) x[*vj++] += (*v++) * x[k];
2096     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2097     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2098     x[k] *= sqrt(diagk);
2099   }
2100   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2101   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2102   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2103   PetscFunctionReturn(0);
2104 }
2105 
2106 #undef __FUNCT__
2107 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2108 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2109 {
2110   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2111   PetscErrorCode ierr;
2112   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2113   MatScalar      *aa=a->a,*v;
2114   PetscReal      diagk;
2115   PetscScalar    *x,*b;
2116   PetscInt       nz,*vj,k;
2117 
2118   PetscFunctionBegin;
2119   /* solve D^(1/2)*U*x = b by back substitution */
2120   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2121   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122 
2123   for (k=mbs-1; k>=0; k--){
2124     v  = aa + ai[k];
2125     vj = aj + ai[k];
2126     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2127     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2128     x[k] = sqrt(diagk)*b[k];
2129     nz = ai[k+1] - ai[k] - 1;
2130     while (nz--) x[k] += (*v++) * x[*vj++];
2131   }
2132   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2133   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2134   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2135   PetscFunctionReturn(0);
2136 }
2137 
2138 #undef __FUNCT__
2139 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2140 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2141 {
2142   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2143   PetscErrorCode ierr;
2144   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2145   MatScalar      *aa=a->a,*v;
2146   PetscReal      diagk;
2147   PetscScalar    *x,*b;
2148   PetscInt       nz,*vj,k;
2149 
2150   PetscFunctionBegin;
2151   /* solve D^(1/2)*U*x = b by back substitution */
2152   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2153   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2154 
2155   for (k=mbs-1; k>=0; k--){
2156     v  = aa + ai[k] + 1;
2157     vj = aj + ai[k] + 1;
2158     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2159     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2160     x[k] = sqrt(diagk)*b[k];
2161     nz = ai[k+1] - ai[k] - 1;
2162     while (nz--) x[k] += (*v++) * x[*vj++];
2163   }
2164   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2165   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2166   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2167   PetscFunctionReturn(0);
2168 }
2169 
2170 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2171 #undef __FUNCT__
2172 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2173 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2174 {
2175   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2176   PetscErrorCode ierr;
2177   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2178   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2179   PetscInt       m,reallocs = 0,*levtmp;
2180   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2181   PetscInt       incrlev,*lev,shift,prow,nz;
2182   PetscReal      f = info->fill,levels = info->levels;
2183   PetscTruth     perm_identity;
2184 
2185   PetscFunctionBegin;
2186   /* check whether perm is the identity mapping */
2187   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2188 
2189   if (perm_identity){
2190     a->permute = PETSC_FALSE;
2191     ai = a->i; aj = a->j;
2192   } else { /*  non-trivial permutation */
2193     SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2194     a->permute = PETSC_TRUE;
2195     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2196     ai = a->inew; aj = a->jnew;
2197   }
2198 
2199   /* initialization */
2200   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2201   umax  = (PetscInt)(f*ai[mbs] + 1);
2202   ierr  = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr);
2203   umax += mbs + 1;
2204   shift = mbs + 1;
2205   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
2206   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
2207   iu[0] = mbs + 1;
2208   juidx = mbs + 1;
2209   /* prowl: linked list for pivot row */
2210   ierr    = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr);
2211   /* q: linked list for col index */
2212 
2213   for (i=0; i<mbs; i++){
2214     prowl[i] = mbs;
2215     q[i] = 0;
2216   }
2217 
2218   /* for each row k */
2219   for (k=0; k<mbs; k++){
2220     nzk  = 0;
2221     q[k] = mbs;
2222     /* copy current row into linked list */
2223     nz = ai[rip[k]+1] - ai[rip[k]];
2224     j = ai[rip[k]];
2225     while (nz--){
2226       vj = rip[aj[j++]];
2227       if (vj > k){
2228         qm = k;
2229         do {
2230           m  = qm; qm = q[m];
2231         } while(qm < vj);
2232         if (qm == vj) {
2233           SETERRQ(PETSC_ERR_PLIB,"Duplicate entry in A\n");
2234         }
2235         nzk++;
2236         q[m]       = vj;
2237         q[vj]      = qm;
2238         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2239       }
2240     }
2241 
2242     /* modify nonzero structure of k-th row by computing fill-in
2243        for each row prow to be merged in */
2244     prow = k;
2245     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2246 
2247     while (prow < k){
2248       /* merge row prow into k-th row */
2249       jmin = iu[prow] + 1;
2250       jmax = iu[prow+1];
2251       qm = k;
2252       for (j=jmin; j<jmax; j++){
2253         incrlev = lev[j-shift] + 1;
2254 	if (incrlev > levels) continue;
2255         vj      = ju[j];
2256         do {
2257           m = qm; qm = q[m];
2258         } while (qm < vj);
2259         if (qm != vj){      /* a fill */
2260           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2261           levtmp[vj] = incrlev;
2262         } else {
2263           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2264         }
2265       }
2266       prow = prowl[prow]; /* next pivot row */
2267     }
2268 
2269     /* add k to row list for first nonzero element in k-th row */
2270     if (nzk > 1){
2271       i = q[k]; /* col value of first nonzero element in k_th row of U */
2272       prowl[k] = prowl[i]; prowl[i] = k;
2273     }
2274     iu[k+1] = iu[k] + nzk;
2275 
2276     /* allocate more space to ju and lev if needed */
2277     if (iu[k+1] > umax) {
2278       /* estimate how much additional space we will need */
2279       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2280       /* just double the memory each time */
2281       maxadd = umax;
2282       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2283       umax += maxadd;
2284 
2285       /* allocate a longer ju */
2286       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2287       ierr     = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2288       ierr     = PetscFree(ju);CHKERRQ(ierr);
2289       ju       = jutmp;
2290 
2291       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2292       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2293       ierr     = PetscFree(lev);CHKERRQ(ierr);
2294       lev      = jutmp;
2295       reallocs += 2; /* count how many times we realloc */
2296     }
2297 
2298     /* save nonzero structure of k-th row in ju */
2299     i=k;
2300     while (nzk --) {
2301       i                = q[i];
2302       ju[juidx]        = i;
2303       lev[juidx-shift] = levtmp[i];
2304       juidx++;
2305     }
2306   }
2307 
2308 #if defined(PETSC_USE_INFO)
2309   if (ai[mbs] != 0) {
2310     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2311     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
2312     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2313     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
2314     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2315   } else {
2316     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2317   }
2318 #endif
2319 
2320   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2321   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2322   ierr = PetscFree(lev);CHKERRQ(ierr);
2323 
2324   /* put together the new matrix */
2325   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
2326 
2327   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
2328   b    = (Mat_SeqSBAIJ*)(B)->data;
2329   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2330   b->singlemalloc = PETSC_FALSE;
2331   b->free_a       = PETSC_TRUE;
2332   b->free_ij      = PETSC_TRUE;
2333   /* the next line frees the default space generated by the Create() */
2334   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2335   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
2336   b->j    = ju;
2337   b->i    = iu;
2338   b->diag = 0;
2339   b->ilen = 0;
2340   b->imax = 0;
2341 
2342   if (b->row) {
2343     ierr = ISDestroy(b->row);CHKERRQ(ierr);
2344   }
2345   if (b->icol) {
2346     ierr = ISDestroy(b->icol);CHKERRQ(ierr);
2347   }
2348   b->row  = perm;
2349   b->icol = perm;
2350   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2351   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2352   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2353   /* In b structure:  Free imax, ilen, old a, old j.
2354      Allocate idnew, solve_work, new a, new j */
2355   ierr    = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2356   b->maxnz = b->nz = iu[mbs];
2357 
2358   (B)->info.factor_mallocs    = reallocs;
2359   (B)->info.fill_ratio_given  = f;
2360   if (ai[mbs] != 0) {
2361     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2362   } else {
2363     (B)->info.fill_ratio_needed = 0.0;
2364   }
2365   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2366   PetscFunctionReturn(0);
2367 }
2368 
2369 /*
2370   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2371 */
2372 #include "petscbt.h"
2373 #include "../src/mat/utils/freespace.h"
2374 #undef __FUNCT__
2375 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2376 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2377 {
2378   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2379   PetscErrorCode     ierr;
2380   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2381   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2382   const PetscInt     *rip;
2383   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2384   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2385   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2386   PetscReal          fill=info->fill,levels=info->levels;
2387   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2388   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2389   PetscBT            lnkbt;
2390 
2391   PetscFunctionBegin;
2392   if (bs > 1){
2393     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2394     PetscFunctionReturn(0);
2395   }
2396   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2397   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2398   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2399 
2400   /* check whether perm is the identity mapping */
2401   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2402   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2403   a->permute = PETSC_FALSE;
2404 
2405   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2406   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr);
2407   ui[0] = 0;
2408 
2409   /* ICC(0) without matrix ordering: simply rearrange column indices */
2410   if (!levels){
2411     /* reuse the column pointers and row offsets for memory savings */
2412     for (i=0; i<am; i++) {
2413       ncols    = ai[i+1] - ai[i];
2414       ui[i+1]  = ui[i] + ncols;
2415       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2416     }
2417     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2418     cols = uj;
2419     for (i=0; i<am; i++) {
2420       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2421       ncols = ai[i+1] - ai[i] -1;
2422       for (j=0; j<ncols; j++) *cols++ = aj[j];
2423       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2424     }
2425   } else { /* case: levels>0 */
2426     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2427 
2428     /* initialization */
2429     /* jl: linked list for storing indices of the pivot rows
2430        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2431     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2432     for (i=0; i<am; i++){
2433       jl[i] = am; il[i] = 0;
2434     }
2435 
2436     /* create and initialize a linked list for storing column indices of the active row k */
2437     nlnk = am + 1;
2438     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2439 
2440     /* initial FreeSpace size is fill*(ai[am]+1) */
2441     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2442     current_space = free_space;
2443     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2444     current_space_lvl = free_space_lvl;
2445 
2446     for (k=0; k<am; k++){  /* for each active row k */
2447       /* initialize lnk by the column indices of row k */
2448       nzk   = 0;
2449       ncols = ai[k+1] - ai[k];
2450       if (!ncols) SETERRQ1(PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2451       cols  = aj+ai[k];
2452       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2453       nzk += nlnk;
2454 
2455       /* update lnk by computing fill-in for each pivot row to be merged in */
2456       prow = jl[k]; /* 1st pivot row */
2457 
2458       while (prow < k){
2459         nextprow = jl[prow];
2460 
2461         /* merge prow into k-th row */
2462         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2463         jmax = ui[prow+1];
2464         ncols = jmax-jmin;
2465         i     = jmin - ui[prow];
2466 
2467         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2468         uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
2469         j     = *(uj - 1);
2470         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2471         nzk += nlnk;
2472 
2473         /* update il and jl for prow */
2474         if (jmin < jmax){
2475           il[prow] = jmin;
2476           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2477         }
2478         prow = nextprow;
2479       }
2480 
2481       /* if free space is not available, make more free space */
2482       if (current_space->local_remaining<nzk) {
2483         i = am - k + 1; /* num of unfactored rows */
2484         i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2485         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2486         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2487         reallocs++;
2488       }
2489 
2490       /* copy data into free_space and free_space_lvl, then initialize lnk */
2491       if (nzk == 0) SETERRQ1(PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2492       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2493 
2494       /* add the k-th row into il and jl */
2495       if (nzk > 1){
2496         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2497         jl[k] = jl[i]; jl[i] = k;
2498         il[k] = ui[k] + 1;
2499       }
2500       uj_ptr[k]     = current_space->array;
2501       uj_lvl_ptr[k] = current_space_lvl->array;
2502 
2503       current_space->array           += nzk;
2504       current_space->local_used      += nzk;
2505       current_space->local_remaining -= nzk;
2506       current_space_lvl->array           += nzk;
2507       current_space_lvl->local_used      += nzk;
2508       current_space_lvl->local_remaining -= nzk;
2509 
2510       ui[k+1] = ui[k] + nzk;
2511     }
2512 
2513 #if defined(PETSC_USE_INFO)
2514     if (ai[am] != 0) {
2515       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2516       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2517       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2518       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2519     } else {
2520       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2521     }
2522 #endif
2523 
2524     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2525     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2526 
2527     /* destroy list of free space and other temporary array(s) */
2528     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2529     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2530     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2531     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2532 
2533   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2534 
2535   /* put together the new matrix in MATSEQSBAIJ format */
2536   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2537 
2538   b = (Mat_SeqSBAIJ*)(fact)->data;
2539   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2540   b->singlemalloc = PETSC_FALSE;
2541   b->free_a  = PETSC_TRUE;
2542   b->free_ij = free_ij;
2543   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2544   b->j     = uj;
2545   b->i     = ui;
2546   b->diag  = udiag;
2547   b->free_diag = PETSC_TRUE;
2548   b->ilen  = 0;
2549   b->imax  = 0;
2550   b->row   = perm;
2551   b->col   = perm;
2552   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2553   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2554   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2555   ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2556   ierr = PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2557   b->maxnz = b->nz = ui[am];
2558 
2559   fact->info.factor_mallocs    = reallocs;
2560   fact->info.fill_ratio_given  = fill;
2561   if (ai[am] != 0) {
2562     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2563   } else {
2564     fact->info.fill_ratio_needed = 0.0;
2565   }
2566   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2567   PetscFunctionReturn(0);
2568 }
2569 
2570 #undef __FUNCT__
2571 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace"
2572 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2573 {
2574   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2575   Mat_SeqSBAIJ       *b;
2576   PetscErrorCode     ierr;
2577   PetscTruth         perm_identity,free_ij = PETSC_TRUE,missing;
2578   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2579   const PetscInt     *cols,*rip,*ai,*aj;
2580   PetscInt           reallocs=0,i,*ui;
2581   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2582   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2583   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2584   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2585   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2586   PetscBT            lnkbt;
2587 
2588   PetscFunctionBegin;
2589   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2590   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2591 
2592   /*
2593    This code originally uses Modified Sparse Row (MSR) storage
2594    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2595    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2596    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2597    thus the original code in MSR format is still used for these cases.
2598    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2599    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2600   */
2601   if (bs > 1){
2602     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2603     PetscFunctionReturn(0);
2604   }
2605 
2606   /* check whether perm is the identity mapping */
2607   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2608   if (!perm_identity) SETERRQ(PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format")
2609 
2610   /* special case that simply copies fill pattern */
2611   if (!levels ) {
2612     a->permute = PETSC_FALSE;
2613     /* reuse the column pointers and row offsets for memory savings */
2614     ui           = a->i;
2615     uj           = a->j;
2616     free_ij      = PETSC_FALSE;
2617     ratio_needed = 1.0;
2618   } else { /* case: levels>0 */
2619     if (perm_identity){
2620       a->permute = PETSC_FALSE;
2621       ai = a->i; aj = a->j;
2622     }
2623     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2624 
2625     /* initialization */
2626     ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2627     ui[0] = 0;
2628 
2629     /* jl: linked list for storing indices of the pivot rows
2630        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2631     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2632     for (i=0; i<am; i++){
2633       jl[i] = am; il[i] = 0;
2634     }
2635 
2636     /* create and initialize a linked list for storing column indices of the active row k */
2637     nlnk = am + 1;
2638     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2639 
2640     /* initial FreeSpace size is fill*(ai[am]+1) */
2641     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2642     current_space = free_space;
2643     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2644     current_space_lvl = free_space_lvl;
2645 
2646     for (k=0; k<am; k++){  /* for each active row k */
2647       /* initialize lnk by the column indices of row rip[k] */
2648       nzk   = 0;
2649       ncols = ai[rip[k]+1] - ai[rip[k]];
2650       cols  = aj+ai[rip[k]];
2651       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2652       nzk += nlnk;
2653 
2654       /* update lnk by computing fill-in for each pivot row to be merged in */
2655       prow = jl[k]; /* 1st pivot row */
2656 
2657       while (prow < k){
2658         nextprow = jl[prow];
2659 
2660         /* merge prow into k-th row */
2661         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2662         jmax = ui[prow+1];
2663         ncols = jmax-jmin;
2664         i     = jmin - ui[prow];
2665         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2666         j     = *(uj_lvl_ptr[prow] + i - 1);
2667         cols_lvl = uj_lvl_ptr[prow]+i;
2668         ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2669         nzk += nlnk;
2670 
2671         /* update il and jl for prow */
2672         if (jmin < jmax){
2673           il[prow] = jmin;
2674           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2675         }
2676         prow = nextprow;
2677       }
2678 
2679       /* if free space is not available, make more free space */
2680       if (current_space->local_remaining<nzk) {
2681         i = am - k + 1; /* num of unfactored rows */
2682         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2683         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2684         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2685         reallocs++;
2686       }
2687 
2688       /* copy data into free_space and free_space_lvl, then initialize lnk */
2689       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2690 
2691       /* add the k-th row into il and jl */
2692       if (nzk-1 > 0){
2693         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2694         jl[k] = jl[i]; jl[i] = k;
2695         il[k] = ui[k] + 1;
2696       }
2697       uj_ptr[k]     = current_space->array;
2698       uj_lvl_ptr[k] = current_space_lvl->array;
2699 
2700       current_space->array           += nzk;
2701       current_space->local_used      += nzk;
2702       current_space->local_remaining -= nzk;
2703       current_space_lvl->array           += nzk;
2704       current_space_lvl->local_used      += nzk;
2705       current_space_lvl->local_remaining -= nzk;
2706 
2707       ui[k+1] = ui[k] + nzk;
2708     }
2709 
2710 #if defined(PETSC_USE_INFO)
2711     if (ai[am] != 0) {
2712       PetscReal af = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2713       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2714       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2715       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2716     } else {
2717       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2718     }
2719 #endif
2720 
2721     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2722     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2723 
2724     /* destroy list of free space and other temporary array(s) */
2725     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2726     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2727     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2728     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2729     if (ai[am] != 0) {
2730       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2731     } else {
2732       ratio_needed = 0.0;
2733     }
2734   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2735 
2736   /* put together the new matrix in MATSEQSBAIJ format */
2737   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2738 
2739   b = (Mat_SeqSBAIJ*)(fact)->data;
2740   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2741   b->singlemalloc = PETSC_FALSE;
2742   b->free_a  = PETSC_TRUE;
2743   b->free_ij = free_ij;
2744   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2745   b->j    = uj;
2746   b->i    = ui;
2747   b->diag    = 0;
2748   b->ilen    = 0;
2749   b->imax    = 0;
2750   b->row     = perm;
2751   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2752   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2753   b->icol = perm;
2754   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2755   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2756   b->maxnz = b->nz = ui[am];
2757 
2758   fact->info.factor_mallocs    = reallocs;
2759   fact->info.fill_ratio_given  = fill;
2760   fact->info.fill_ratio_needed = ratio_needed;
2761   if (perm_identity){
2762     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2763   } else {
2764     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2765   }
2766   PetscFunctionReturn(0);
2767 }
2768 
2769