xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 01a79839fc82a7dabb7a87cd2a8bb532c6bfa88d)
1 #define PETSCMAT_DLL
2 
3 /*
4     Factorization code for SBAIJ format.
5 */
6 
7 #include "../src/mat/impls/sbaij/seq/sbaij.h"
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_inplace"
13 PetscErrorCode MatSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
16   IS              isrow=a->row;
17   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j;
18   PetscErrorCode  ierr;
19   const PetscInt  *r;
20   PetscInt        nz,*vj,k,idx,k1;
21   PetscInt        bs=A->rmap->bs,bs2 = a->bs2;
22   MatScalar       *aa=a->a,*v,*diag;
23   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;
24 
25   PetscFunctionBegin;
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28   t  = a->solve_work;
29   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
31 
32   /* solve U^T * D * y = b by forward substitution */
33   xk = t;
34   for (k=0; k<mbs; k++) { /* t <- perm(b) */
35     idx   = bs*r[k];
36     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
37   }
38   for (k=0; k<mbs; k++){
39     v  = aa + bs2*ai[k];
40     xk = t + k*bs;      /* Dk*xk = k-th block of x */
41     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
42     nz = ai[k+1] - ai[k];
43     vj = aj + ai[k];
44     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
45     while (nz--) {
46       /* x(:) += U(k,:)^T*(Dk*xk) */
47       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
48       vj++; xj = t + (*vj)*bs;
49       v += bs2;
50     }
51     /* xk = inv(Dk)*(Dk*xk) */
52     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
53     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
54   }
55 
56   /* solve U*x = y by back substitution */
57   for (k=mbs-1; k>=0; k--){
58     v  = aa + bs2*ai[k];
59     xk = t + k*bs;        /* xk */
60     nz = ai[k+1] - ai[k];
61     vj = aj + ai[k];
62     xj = t + (*vj)*bs;
63     while (nz--) {
64       /* xk += U(k,:)*x(:) */
65       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
66       vj++;
67       v += bs2; xj = t + (*vj)*bs;
68     }
69     idx   = bs*r[k];
70     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
71   }
72 
73   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
74   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
75   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
76   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
77   ierr = PetscLogFlops(4.0*bs2*a->nz -(bs+2.0*bs2)*mbs);CHKERRQ(ierr);
78   PetscFunctionReturn(0);
79 }
80 
81 #undef __FUNCT__
82 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_inplace"
83 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
84 {
85   PetscFunctionBegin;
86   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
87   PetscFunctionReturn(0);
88 }
89 
90 #undef __FUNCT__
91 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_inplace"
92 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
93 {
94   PetscFunctionBegin;
95   SETERRQ(PETSC_COMM_SELF,1,"not implemented yet");
96   PetscFunctionReturn(0);
97 }
98 
99 #undef __FUNCT__
100 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
101 PetscErrorCode ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
102 {
103   PetscErrorCode ierr;
104   PetscInt       nz,*vj,k;
105   PetscInt       bs2 = bs*bs;
106   MatScalar      *v,*diag;
107   PetscScalar    *xk,*xj,*xk_tmp;
108 
109   PetscFunctionBegin;
110   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
111   for (k=0; k<mbs; k++){
112     v  = aa + bs2*ai[k];
113     xk = x + k*bs;      /* Dk*xk = k-th block of x */
114     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
115     nz = ai[k+1] - ai[k];
116     vj = aj + ai[k];
117     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
118     while (nz--) {
119       /* x(:) += U(k,:)^T*(Dk*xk) */
120       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
121       vj++; xj = x + (*vj)*bs;
122       v += bs2;
123     }
124     /* xk = inv(Dk)*(Dk*xk) */
125     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
126     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
127   }
128   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
129   PetscFunctionReturn(0);
130 }
131 
132 #undef __FUNCT__
133 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private"
134 PetscErrorCode BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscInt bs,PetscScalar *x)
135 {
136   PetscInt       nz,*vj,k;
137   PetscInt       bs2 = bs*bs;
138   MatScalar      *v;
139   PetscScalar    *xk,*xj;
140 
141   PetscFunctionBegin;
142   for (k=mbs-1; k>=0; k--){
143     v  = aa + bs2*ai[k];
144     xk = x + k*bs;        /* xk */
145     nz = ai[k+1] - ai[k];
146     vj = aj + ai[k];
147     xj = x + (*vj)*bs;
148     while (nz--) {
149       /* xk += U(k,:)*x(:) */
150       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
151       vj++;
152       v += bs2; xj = x + (*vj)*bs;
153     }
154   }
155   PetscFunctionReturn(0);
156 }
157 
158 #undef __FUNCT__
159 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
160 PetscErrorCode MatSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
161 {
162   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
163   PetscErrorCode ierr;
164   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
165   PetscInt       bs=A->rmap->bs;
166   MatScalar      *aa=a->a;
167   PetscScalar    *x,*b;
168 
169   PetscFunctionBegin;
170   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172 
173   /* solve U^T * D * y = b by forward substitution */
174   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
175   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
176 
177   /* solve U*x = y by back substitution */
178   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
179 
180   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
181   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
182   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
183   PetscFunctionReturn(0);
184 }
185 
186 #undef __FUNCT__
187 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
188 PetscErrorCode MatForwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
189 {
190   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
191   PetscErrorCode ierr;
192   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
193   PetscInt       bs=A->rmap->bs;
194   MatScalar      *aa=a->a;
195   PetscScalar    *x,*b;
196 
197   PetscFunctionBegin;
198   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
199   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
200   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
201   ierr = ForwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
202   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
203   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
204   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
205   PetscFunctionReturn(0);
206 }
207 
208 #undef __FUNCT__
209 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace"
210 PetscErrorCode MatBackwardSolve_SeqSBAIJ_N_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
211 {
212   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
213   PetscErrorCode ierr;
214   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
215   PetscInt       bs=A->rmap->bs;
216   MatScalar      *aa=a->a;
217   PetscScalar    *x,*b;
218 
219   PetscFunctionBegin;
220   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
221   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
222   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
223   ierr = BackwardSolve_SeqSBAIJ_N_NaturalOrdering_private(ai,aj,aa,mbs,bs,x);CHKERRQ(ierr);
224   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
225   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
226   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
227   PetscFunctionReturn(0);
228 }
229 
230 #undef __FUNCT__
231 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_inplace"
232 PetscErrorCode MatSolve_SeqSBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
233 {
234   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
235   IS             isrow=a->row;
236   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2,bs=A->rmap->bs;
237   PetscErrorCode ierr;
238   const PetscInt *r;
239   PetscInt       nz,*vj,k,idx;
240   MatScalar      *aa=a->a,*v,*d;
241   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
242 
243   PetscFunctionBegin;
244   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
245   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
246   t  = a->solve_work;
247   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
248 
249   /* solve U^T * D * y = b by forward substitution */
250   tp = t;
251   for (k=0; k<mbs; k++) { /* t <- perm(b) */
252     idx   = 7*r[k];
253     tp[0] = b[idx];
254     tp[1] = b[idx+1];
255     tp[2] = b[idx+2];
256     tp[3] = b[idx+3];
257     tp[4] = b[idx+4];
258     tp[5] = b[idx+5];
259     tp[6] = b[idx+6];
260     tp += 7;
261   }
262 
263   for (k=0; k<mbs; k++){
264     v  = aa + 49*ai[k];
265     vj = aj + ai[k];
266     tp = t + k*7;
267     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
268     nz = ai[k+1] - ai[k];
269     tp = t + (*vj)*7;
270     while (nz--) {
271       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
272       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
273       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
274       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
275       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
276       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
277       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
278       vj++; tp = t + (*vj)*7;
279       v += 49;
280     }
281 
282     /* xk = inv(Dk)*(Dk*xk) */
283     d  = aa+k*49;          /* ptr to inv(Dk) */
284     tp    = t + k*7;
285     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
286     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
287     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
288     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
289     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
290     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
291     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
292   }
293 
294   /* solve U*x = y by back substitution */
295   for (k=mbs-1; k>=0; k--){
296     v  = aa + 49*ai[k];
297     vj = aj + ai[k];
298     tp    = t + k*7;
299     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
300     nz = ai[k+1] - ai[k];
301 
302     tp = t + (*vj)*7;
303     while (nz--) {
304       /* xk += U(k,:)*x(:) */
305       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
306       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
307       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
308       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
309       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
310       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
311       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
312       vj++; tp = t + (*vj)*7;
313       v += 49;
314     }
315     tp    = t + k*7;
316     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
317     idx   = 7*r[k];
318     x[idx]     = x0;
319     x[idx+1]   = x1;
320     x[idx+2]   = x2;
321     x[idx+3]   = x3;
322     x[idx+4]   = x4;
323     x[idx+5]   = x5;
324     x[idx+6]   = x6;
325   }
326 
327   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
328   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
329   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
330   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
331   PetscFunctionReturn(0);
332 }
333 
334 #undef __FUNCT__
335 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
336 PetscErrorCode ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
337 {
338   MatScalar      *v,*d;
339   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
340   PetscInt       nz,*vj,k;
341 
342   PetscFunctionBegin;
343   for (k=0; k<mbs; k++){
344     v  = aa + 49*ai[k];
345     xp = x + k*7;
346     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
347     nz = ai[k+1] - ai[k];
348     vj = aj + ai[k];
349     xp = x + (*vj)*7;
350     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
351     PetscPrefetchBlock(v+49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
352     while (nz--) {
353       /* x(:) += U(k,:)^T*(Dk*xk) */
354       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
355       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
356       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
357       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
358       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
359       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
360       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
361       vj++; xp = x + (*vj)*7;
362       v += 49;
363     }
364     /* xk = inv(Dk)*(Dk*xk) */
365     d  = aa+k*49;          /* ptr to inv(Dk) */
366     xp = x + k*7;
367     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
368     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
369     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
370     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
371     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
372     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
373     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
374   }
375   PetscFunctionReturn(0);
376 }
377 
378 #undef __FUNCT__
379 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private"
380 PetscErrorCode BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
381 {
382   MatScalar      *v;
383   PetscScalar    *xp,x0,x1,x2,x3,x4,x5,x6;
384   PetscInt       nz,*vj,k;
385 
386   PetscFunctionBegin;
387   for (k=mbs-1; k>=0; k--){
388     v  = aa + 49*ai[k];
389     xp = x + k*7;
390     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
391     nz = ai[k+1] - ai[k];
392     vj = aj + ai[k];
393     xp = x + (*vj)*7;
394     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
395     PetscPrefetchBlock(v-49*nz,49*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
396     while (nz--) {
397       /* xk += U(k,:)*x(:) */
398       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
399       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
400       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
401       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
402       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
403       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
404       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
405       vj++;
406       v += 49; xp = x + (*vj)*7;
407     }
408     xp = x + k*7;
409     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
410   }
411   PetscFunctionReturn(0);
412 }
413 
414 #undef __FUNCT__
415 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
416 PetscErrorCode MatSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
417 {
418   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
419   PetscErrorCode ierr;
420   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
421   MatScalar      *aa=a->a;
422   PetscScalar    *x,*b;
423 
424   PetscFunctionBegin;
425   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
426   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
427 
428   /* solve U^T * D * y = b by forward substitution */
429   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
430   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
431 
432   /* solve U*x = y by back substitution */
433   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
434 
435   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
436   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
437   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
438   PetscFunctionReturn(0);
439 }
440 
441 #undef __FUNCT__
442 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
443 PetscErrorCode MatForwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
444 {
445   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
446   PetscErrorCode ierr;
447   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
448   MatScalar      *aa=a->a;
449   PetscScalar    *x,*b;
450 
451   PetscFunctionBegin;
452   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
453   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
454   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
455   ierr = ForwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
456   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
457   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
458   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
459   PetscFunctionReturn(0);
460 }
461 
462 #undef __FUNCT__
463 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace"
464 PetscErrorCode MatBackwardSolve_SeqSBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
467   PetscErrorCode ierr;
468   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
469   MatScalar      *aa=a->a;
470   PetscScalar    *x,*b;
471 
472   PetscFunctionBegin;
473   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
474   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
475   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
476   ierr = BackwardSolve_SeqSBAIJ_7_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
477   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
478   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
479   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
480   PetscFunctionReturn(0);
481 }
482 
483 #undef __FUNCT__
484 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_inplace"
485 PetscErrorCode MatSolve_SeqSBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
486 {
487   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
488   IS             isrow=a->row;
489   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
490   PetscErrorCode ierr;
491   const PetscInt *r;
492   PetscInt       nz,*vj,k,idx;
493   MatScalar      *aa=a->a,*v,*d;
494   PetscScalar    *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
495 
496   PetscFunctionBegin;
497   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
498   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
499   t  = a->solve_work;
500   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
501 
502   /* solve U^T * D * y = b by forward substitution */
503   tp = t;
504   for (k=0; k<mbs; k++) { /* t <- perm(b) */
505     idx   = 6*r[k];
506     tp[0] = b[idx];
507     tp[1] = b[idx+1];
508     tp[2] = b[idx+2];
509     tp[3] = b[idx+3];
510     tp[4] = b[idx+4];
511     tp[5] = b[idx+5];
512     tp += 6;
513   }
514 
515   for (k=0; k<mbs; k++){
516     v  = aa + 36*ai[k];
517     vj = aj + ai[k];
518     tp = t + k*6;
519     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
520     nz = ai[k+1] - ai[k];
521     tp = t + (*vj)*6;
522     while (nz--) {
523       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
524       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
525       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
526       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
527       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
528       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
529       vj++; tp = t + (*vj)*6;
530       v += 36;
531     }
532 
533     /* xk = inv(Dk)*(Dk*xk) */
534     d  = aa+k*36;          /* ptr to inv(Dk) */
535     tp    = t + k*6;
536     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
537     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
538     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
539     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
540     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
541     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
542   }
543 
544   /* solve U*x = y by back substitution */
545   for (k=mbs-1; k>=0; k--){
546     v  = aa + 36*ai[k];
547     vj = aj + ai[k];
548     tp    = t + k*6;
549     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
550     nz = ai[k+1] - ai[k];
551 
552     tp = t + (*vj)*6;
553     while (nz--) {
554       /* xk += U(k,:)*x(:) */
555       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
556       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
557       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
558       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
559       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
560       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
561       vj++; tp = t + (*vj)*6;
562       v += 36;
563     }
564     tp    = t + k*6;
565     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
566     idx   = 6*r[k];
567     x[idx]     = x0;
568     x[idx+1]   = x1;
569     x[idx+2]   = x2;
570     x[idx+3]   = x3;
571     x[idx+4]   = x4;
572     x[idx+5]   = x5;
573   }
574 
575   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
576   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
577   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
578   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
579   PetscFunctionReturn(0);
580 }
581 
582 #undef __FUNCT__
583 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
584 PetscErrorCode ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
585 {
586   MatScalar      *v,*d;
587   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
588   PetscInt       nz,*vj,k;
589 
590   PetscFunctionBegin;
591   for (k=0; k<mbs; k++){
592     v  = aa + 36*ai[k];
593     xp = x + k*6;
594     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
595     nz = ai[k+1] - ai[k];
596     vj = aj + ai[k];
597     xp = x + (*vj)*6;
598     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
599     PetscPrefetchBlock(v+36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
600     while (nz--) {
601       /* x(:) += U(k,:)^T*(Dk*xk) */
602       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
603       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
604       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
605       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
606       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
607       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
608       vj++; xp = x + (*vj)*6;
609       v += 36;
610     }
611     /* xk = inv(Dk)*(Dk*xk) */
612     d  = aa+k*36;          /* ptr to inv(Dk) */
613     xp = x + k*6;
614     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
615     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
616     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
617     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
618     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
619     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
620   }
621   PetscFunctionReturn(0);
622 }
623 #undef __FUNCT__
624 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private"
625 PetscErrorCode BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
626 {
627   MatScalar      *v;
628   PetscScalar    *xp,x0,x1,x2,x3,x4,x5;
629   PetscInt       nz,*vj,k;
630 
631   PetscFunctionBegin;
632   for (k=mbs-1; k>=0; k--){
633     v  = aa + 36*ai[k];
634     xp = x + k*6;
635     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
636     nz = ai[k+1] - ai[k];
637     vj = aj + ai[k];
638     xp = x + (*vj)*6;
639     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
640     PetscPrefetchBlock(v-36*nz,36*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
641     while (nz--) {
642       /* xk += U(k,:)*x(:) */
643       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
644       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
645       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
646       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
647       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
648       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
649       vj++;
650       v += 36; xp = x + (*vj)*6;
651     }
652     xp = x + k*6;
653     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
654   }
655   PetscFunctionReturn(0);
656 }
657 
658 
659 #undef __FUNCT__
660 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
661 PetscErrorCode MatSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
662 {
663   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
664   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
665   MatScalar      *aa=a->a;
666   PetscScalar    *x,*b;
667   PetscErrorCode ierr;
668 
669   PetscFunctionBegin;
670   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
671   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
672 
673   /* solve U^T * D * y = b by forward substitution */
674   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
675   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
676 
677   /* solve U*x = y by back substitution */
678   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
679 
680   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
681   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
682   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
683   PetscFunctionReturn(0);
684 }
685 
686 #undef __FUNCT__
687 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
688 PetscErrorCode MatForwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
689 {
690   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
691   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
692   MatScalar      *aa=a->a;
693   PetscScalar    *x,*b;
694   PetscErrorCode ierr;
695 
696   PetscFunctionBegin;
697   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
698   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
699   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
700   ierr = ForwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
701   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
702   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
703   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
704   PetscFunctionReturn(0);
705 }
706 
707 #undef __FUNCT__
708 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace"
709 PetscErrorCode MatBackwardSolve_SeqSBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
710 {
711   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
712   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
713   MatScalar      *aa=a->a;
714   PetscScalar    *x,*b;
715   PetscErrorCode ierr;
716 
717   PetscFunctionBegin;
718   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
719   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
720   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
721   ierr = BackwardSolve_SeqSBAIJ_6_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
722   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
723   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
724   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
725   PetscFunctionReturn(0);
726 }
727 
728 #undef __FUNCT__
729 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_inplace"
730 PetscErrorCode MatSolve_SeqSBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
731 {
732   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
733   IS             isrow=a->row;
734   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
735   PetscErrorCode ierr;
736   const PetscInt *r;
737   PetscInt       nz,*vj,k,idx;
738   MatScalar      *aa=a->a,*v,*diag;
739   PetscScalar    *x,*b,x0,x1,x2,x3,x4,*t,*tp;
740 
741   PetscFunctionBegin;
742   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
743   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
744   t  = a->solve_work;
745   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
746 
747   /* solve U^T * D * y = b by forward substitution */
748   tp = t;
749   for (k=0; k<mbs; k++) { /* t <- perm(b) */
750     idx   = 5*r[k];
751     tp[0] = b[idx];
752     tp[1] = b[idx+1];
753     tp[2] = b[idx+2];
754     tp[3] = b[idx+3];
755     tp[4] = b[idx+4];
756     tp += 5;
757   }
758 
759   for (k=0; k<mbs; k++){
760     v  = aa + 25*ai[k];
761     vj = aj + ai[k];
762     tp = t + k*5;
763     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
764     nz = ai[k+1] - ai[k];
765 
766     tp = t + (*vj)*5;
767     while (nz--) {
768       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
769       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
770       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
771       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
772       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
773       vj++; tp = t + (*vj)*5;
774       v += 25;
775     }
776 
777     /* xk = inv(Dk)*(Dk*xk) */
778     diag  = aa+k*25;          /* ptr to inv(Dk) */
779     tp    = t + k*5;
780       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
781       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
782       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
783       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
784       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
785   }
786 
787   /* solve U*x = y by back substitution */
788   for (k=mbs-1; k>=0; k--){
789     v  = aa + 25*ai[k];
790     vj = aj + ai[k];
791     tp    = t + k*5;
792     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
793     nz = ai[k+1] - ai[k];
794 
795     tp = t + (*vj)*5;
796     while (nz--) {
797       /* xk += U(k,:)*x(:) */
798       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
799       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
800       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
801       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
802       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
803       vj++; tp = t + (*vj)*5;
804       v += 25;
805     }
806     tp    = t + k*5;
807     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
808     idx   = 5*r[k];
809     x[idx]     = x0;
810     x[idx+1]   = x1;
811     x[idx+2]   = x2;
812     x[idx+3]   = x3;
813     x[idx+4]   = x4;
814   }
815 
816   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
817   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
818   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
819   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
820   PetscFunctionReturn(0);
821 }
822 
823 #undef __FUNCT__
824 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
825 PetscErrorCode ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
826 {
827   MatScalar      *v,*diag;
828   PetscScalar    *xp,x0,x1,x2,x3,x4;
829   PetscInt       nz,*vj,k;
830 
831   PetscFunctionBegin;
832   for (k=0; k<mbs; k++){
833     v  = aa + 25*ai[k];
834     xp = x + k*5;
835     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
836     nz = ai[k+1] - ai[k];
837     vj = aj + ai[k];
838     xp = x + (*vj)*5;
839     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
840     PetscPrefetchBlock(v+25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
841     while (nz--) {
842       /* x(:) += U(k,:)^T*(Dk*xk) */
843       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
844       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
845       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
846       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
847       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
848       vj++; xp = x + (*vj)*5;
849       v += 25;
850     }
851     /* xk = inv(Dk)*(Dk*xk) */
852     diag = aa+k*25;          /* ptr to inv(Dk) */
853     xp   = x + k*5;
854     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
855     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
856     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
857     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
858     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
859   }
860   PetscFunctionReturn(0);
861 }
862 
863 #undef __FUNCT__
864 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private"
865 PetscErrorCode BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
866 {
867   MatScalar      *v;
868   PetscScalar    *xp,x0,x1,x2,x3,x4;
869   PetscInt       nz,*vj,k;
870 
871   PetscFunctionBegin;
872   for (k=mbs-1; k>=0; k--){
873     v  = aa + 25*ai[k];
874     xp = x + k*5;
875     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
876     nz = ai[k+1] - ai[k];
877     vj = aj + ai[k];
878     xp = x + (*vj)*5;
879     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
880     PetscPrefetchBlock(v-25*nz,25*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
881     while (nz--) {
882       /* xk += U(k,:)*x(:) */
883       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
884       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
885       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
886       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
887       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
888       vj++;
889       v += 25; xp = x + (*vj)*5;
890     }
891     xp = x + k*5;
892     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
893   }
894   PetscFunctionReturn(0);
895 }
896 
897 #undef __FUNCT__
898 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
899 PetscErrorCode MatSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
900 {
901   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
902   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2 = a->bs2;
903   MatScalar      *aa=a->a;
904   PetscScalar    *x,*b;
905   PetscErrorCode ierr;
906 
907   PetscFunctionBegin;
908   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
909   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
910 
911   /* solve U^T * D * y = b by forward substitution */
912   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
913   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
914 
915   /* solve U*x = y by back substitution */
916   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
917 
918   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
919   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
920   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
921   PetscFunctionReturn(0);
922 }
923 
924 #undef __FUNCT__
925 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
926 PetscErrorCode MatForwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
927 {
928   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
929   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
930   MatScalar      *aa=a->a;
931   PetscScalar    *x,*b;
932   PetscErrorCode ierr;
933 
934   PetscFunctionBegin;
935   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
936   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
937   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
938   ierr = ForwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
939   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
940   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
941   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
942   PetscFunctionReturn(0);
943 }
944 
945 #undef __FUNCT__
946 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace"
947 PetscErrorCode MatBackwardSolve_SeqSBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
948 {
949   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
950   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
951   MatScalar      *aa=a->a;
952   PetscScalar    *x,*b;
953   PetscErrorCode ierr;
954 
955   PetscFunctionBegin;
956   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
957   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
958   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
959   ierr = BackwardSolve_SeqSBAIJ_5_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
960   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
961   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
962   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
963   PetscFunctionReturn(0);
964 }
965 
966 #undef __FUNCT__
967 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_inplace"
968 PetscErrorCode MatSolve_SeqSBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
969 {
970   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
971   IS             isrow=a->row;
972   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
973   PetscErrorCode ierr;
974   const PetscInt *r;
975   PetscInt       nz,*vj,k,idx;
976   MatScalar      *aa=a->a,*v,*diag;
977   PetscScalar    *x,*b,x0,x1,x2,x3,*t,*tp;
978 
979   PetscFunctionBegin;
980   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
981   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
982   t  = a->solve_work;
983   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
984 
985   /* solve U^T * D * y = b by forward substitution */
986   tp = t;
987   for (k=0; k<mbs; k++) { /* t <- perm(b) */
988     idx   = 4*r[k];
989     tp[0] = b[idx];
990     tp[1] = b[idx+1];
991     tp[2] = b[idx+2];
992     tp[3] = b[idx+3];
993     tp += 4;
994   }
995 
996   for (k=0; k<mbs; k++){
997     v  = aa + 16*ai[k];
998     vj = aj + ai[k];
999     tp = t + k*4;
1000     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
1001     nz = ai[k+1] - ai[k];
1002 
1003     tp = t + (*vj)*4;
1004     while (nz--) {
1005       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1006       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1007       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1008       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1009       vj++; tp = t + (*vj)*4;
1010       v += 16;
1011     }
1012 
1013     /* xk = inv(Dk)*(Dk*xk) */
1014     diag  = aa+k*16;          /* ptr to inv(Dk) */
1015     tp    = t + k*4;
1016     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1017     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1018     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1019     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1020   }
1021 
1022   /* solve U*x = y by back substitution */
1023   for (k=mbs-1; k>=0; k--){
1024     v  = aa + 16*ai[k];
1025     vj = aj + ai[k];
1026     tp    = t + k*4;
1027     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
1028     nz = ai[k+1] - ai[k];
1029 
1030     tp = t + (*vj)*4;
1031     while (nz--) {
1032       /* xk += U(k,:)*x(:) */
1033       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
1034       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
1035       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
1036       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
1037       vj++; tp = t + (*vj)*4;
1038       v += 16;
1039     }
1040     tp    = t + k*4;
1041     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
1042     idx        = 4*r[k];
1043     x[idx]     = x0;
1044     x[idx+1]   = x1;
1045     x[idx+2]   = x2;
1046     x[idx+3]   = x3;
1047   }
1048 
1049   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1050   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1051   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1052   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1053    PetscFunctionReturn(0);
1054 }
1055 
1056 #undef __FUNCT__
1057 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1058 PetscErrorCode ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1059 {
1060   MatScalar      *v,*diag;
1061   PetscScalar    *xp,x0,x1,x2,x3;
1062   PetscInt       nz,*vj,k;
1063 
1064   PetscFunctionBegin;
1065   for (k=0; k<mbs; k++){
1066     v  = aa + 16*ai[k];
1067     xp = x + k*4;
1068     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
1069     nz = ai[k+1] - ai[k];
1070     vj = aj + ai[k];
1071     xp = x + (*vj)*4;
1072     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1073     PetscPrefetchBlock(v+16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1074     while (nz--) {
1075       /* x(:) += U(k,:)^T*(Dk*xk) */
1076       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
1077       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
1078       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
1079       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
1080       vj++; xp = x + (*vj)*4;
1081       v += 16;
1082     }
1083     /* xk = inv(Dk)*(Dk*xk) */
1084     diag = aa+k*16;          /* ptr to inv(Dk) */
1085     xp   = x + k*4;
1086     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
1087     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
1088     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
1089     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
1090   }
1091   PetscFunctionReturn(0);
1092 }
1093 
1094 #undef __FUNCT__
1095 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private"
1096 PetscErrorCode BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1097 {
1098   MatScalar      *v;
1099   PetscScalar    *xp,x0,x1,x2,x3;
1100   PetscInt       nz,*vj,k;
1101 
1102   PetscFunctionBegin;
1103   for (k=mbs-1; k>=0; k--){
1104     v  = aa + 16*ai[k];
1105     xp = x + k*4;
1106     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
1107     nz = ai[k+1] - ai[k];
1108     vj = aj + ai[k];
1109     xp = x + (*vj)*4;
1110     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);      /* Indices for the next row (assumes same size as this one) */
1111     PetscPrefetchBlock(v-16*nz,16*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1112     while (nz--) {
1113       /* xk += U(k,:)*x(:) */
1114       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
1115       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
1116       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
1117       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
1118       vj++;
1119       v += 16; xp = x + (*vj)*4;
1120     }
1121     xp = x + k*4;
1122     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
1123   }
1124   PetscFunctionReturn(0);
1125 }
1126 
1127 #undef __FUNCT__
1128 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1129 PetscErrorCode MatSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1130 {
1131   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1132   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1133   MatScalar      *aa=a->a;
1134   PetscScalar    *x,*b;
1135   PetscErrorCode ierr;
1136 
1137   PetscFunctionBegin;
1138   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1139   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1140 
1141   /* solve U^T * D * y = b by forward substitution */
1142   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1143   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1144 
1145   /* solve U*x = y by back substitution */
1146   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1147   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1148   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1149   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1150   PetscFunctionReturn(0);
1151 }
1152 
1153 #undef __FUNCT__
1154 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1155 PetscErrorCode MatForwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1156 {
1157   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1158   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1159   MatScalar      *aa=a->a;
1160   PetscScalar    *x,*b;
1161   PetscErrorCode ierr;
1162 
1163   PetscFunctionBegin;
1164   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
1167   ierr = ForwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace"
1176 PetscErrorCode MatBackwardSolve_SeqSBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1177 {
1178   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1179   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1180   MatScalar      *aa=a->a;
1181   PetscScalar    *x,*b;
1182   PetscErrorCode ierr;
1183 
1184   PetscFunctionBegin;
1185   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1186   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1187   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1188   ierr = BackwardSolve_SeqSBAIJ_4_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1189   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1190   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1191   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1192   PetscFunctionReturn(0);
1193 }
1194 
1195 #undef __FUNCT__
1196 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_inplace"
1197 PetscErrorCode MatSolve_SeqSBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1198 {
1199   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1200   IS             isrow=a->row;
1201   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1202   PetscErrorCode ierr;
1203   const PetscInt *r;
1204   PetscInt       nz,*vj,k,idx;
1205   MatScalar      *aa=a->a,*v,*diag;
1206   PetscScalar    *x,*b,x0,x1,x2,*t,*tp;
1207 
1208   PetscFunctionBegin;
1209   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1210   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1211   t  = a->solve_work;
1212   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1213 
1214   /* solve U^T * D * y = b by forward substitution */
1215   tp = t;
1216   for (k=0; k<mbs; k++) { /* t <- perm(b) */
1217     idx   = 3*r[k];
1218     tp[0] = b[idx];
1219     tp[1] = b[idx+1];
1220     tp[2] = b[idx+2];
1221     tp += 3;
1222   }
1223 
1224   for (k=0; k<mbs; k++){
1225     v  = aa + 9*ai[k];
1226     vj = aj + ai[k];
1227     tp = t + k*3;
1228     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
1229     nz = ai[k+1] - ai[k];
1230 
1231     tp = t + (*vj)*3;
1232     while (nz--) {
1233       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1234       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1235       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1236       vj++; tp = t + (*vj)*3;
1237       v += 9;
1238     }
1239 
1240     /* xk = inv(Dk)*(Dk*xk) */
1241     diag  = aa+k*9;          /* ptr to inv(Dk) */
1242     tp    = t + k*3;
1243     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1244     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1245     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1246   }
1247 
1248   /* solve U*x = y by back substitution */
1249   for (k=mbs-1; k>=0; k--){
1250     v  = aa + 9*ai[k];
1251     vj = aj + ai[k];
1252     tp    = t + k*3;
1253     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
1254     nz = ai[k+1] - ai[k];
1255 
1256     tp = t + (*vj)*3;
1257     while (nz--) {
1258       /* xk += U(k,:)*x(:) */
1259       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
1260       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
1261       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
1262       vj++; tp = t + (*vj)*3;
1263       v += 9;
1264     }
1265     tp    = t + k*3;
1266     tp[0] = x0; tp[1] = x1; tp[2] = x2;
1267     idx      = 3*r[k];
1268     x[idx]   = x0;
1269     x[idx+1] = x1;
1270     x[idx+2] = x2;
1271   }
1272 
1273   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1274   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1275   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1276   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1277   PetscFunctionReturn(0);
1278 }
1279 
1280 #undef __FUNCT__
1281 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1282 PetscErrorCode ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1283 {
1284   MatScalar      *v,*diag;
1285   PetscScalar    *xp,x0,x1,x2;
1286   PetscInt       nz,*vj,k;
1287 
1288   PetscFunctionBegin;
1289   for (k=0; k<mbs; k++){
1290     v  = aa + 9*ai[k];
1291     xp = x + k*3;
1292     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1293     nz = ai[k+1] - ai[k];
1294     vj = aj + ai[k];
1295     xp = x + (*vj)*3;
1296     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1297     PetscPrefetchBlock(v+9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1298     while (nz--) {
1299       /* x(:) += U(k,:)^T*(Dk*xk) */
1300       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1301       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1302       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1303       vj++; xp = x + (*vj)*3;
1304       v += 9;
1305     }
1306     /* xk = inv(Dk)*(Dk*xk) */
1307     diag = aa+k*9;          /* ptr to inv(Dk) */
1308     xp   = x + k*3;
1309     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1310     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1311     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1312   }
1313   PetscFunctionReturn(0);
1314 }
1315 
1316 #undef __FUNCT__
1317 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private"
1318 PetscErrorCode BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1319 {
1320   MatScalar      *v;
1321   PetscScalar    *xp,x0,x1,x2;
1322   PetscInt       nz,*vj,k;
1323 
1324   PetscFunctionBegin;
1325   for (k=mbs-1; k>=0; k--){
1326     v  = aa + 9*ai[k];
1327     xp = x + k*3;
1328     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1329     nz = ai[k+1] - ai[k];
1330     vj = aj + ai[k];
1331     xp = x + (*vj)*3;
1332     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1333     PetscPrefetchBlock(v-9*nz,9*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1334     while (nz--) {
1335       /* xk += U(k,:)*x(:) */
1336       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1337       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1338       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1339       vj++;
1340       v += 9; xp = x + (*vj)*3;
1341     }
1342     xp = x + k*3;
1343     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1344   }
1345   PetscFunctionReturn(0);
1346 }
1347 
1348 #undef __FUNCT__
1349 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1350 PetscErrorCode MatSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1351 {
1352   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1353   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1354   MatScalar      *aa=a->a;
1355   PetscScalar    *x,*b;
1356   PetscErrorCode ierr;
1357 
1358   PetscFunctionBegin;
1359   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1360   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361 
1362   /* solve U^T * D * y = b by forward substitution */
1363   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1364   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1365 
1366   /* solve U*x = y by back substitution */
1367   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1368 
1369   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1370   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1371   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1372   PetscFunctionReturn(0);
1373 }
1374 
1375 #undef __FUNCT__
1376 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1377 PetscErrorCode MatForwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1378 {
1379   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1380   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1381   MatScalar      *aa=a->a;
1382   PetscScalar    *x,*b;
1383   PetscErrorCode ierr;
1384 
1385   PetscFunctionBegin;
1386   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1387   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1388   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1389   ierr = ForwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1390   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1391   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1392   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1393   PetscFunctionReturn(0);
1394 }
1395 
1396 #undef __FUNCT__
1397 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace"
1398 PetscErrorCode MatBackwardSolve_SeqSBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1399 {
1400   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1401   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1402   MatScalar      *aa=a->a;
1403   PetscScalar    *x,*b;
1404   PetscErrorCode ierr;
1405 
1406   PetscFunctionBegin;
1407   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1408   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1409   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1410   ierr = BackwardSolve_SeqSBAIJ_3_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1411   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1412   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1413   ierr = PetscLogFlops(2.0*bs2*(a->nz-mbs));CHKERRQ(ierr);
1414   PetscFunctionReturn(0);
1415 }
1416 
1417 #undef __FUNCT__
1418 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_inplace"
1419 PetscErrorCode MatSolve_SeqSBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
1420 {
1421   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ *)A->data;
1422   IS             isrow=a->row;
1423   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1424   PetscErrorCode ierr;
1425   const PetscInt *r;
1426   PetscInt       nz,*vj,k,k2,idx;
1427   MatScalar      *aa=a->a,*v,*diag;
1428   PetscScalar    *x,*b,x0,x1,*t;
1429 
1430   PetscFunctionBegin;
1431   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1432   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1433   t  = a->solve_work;
1434   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1435 
1436   /* solve U^T * D * y = perm(b) by forward substitution */
1437   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1438     idx = 2*r[k];
1439     t[k*2]   = b[idx];
1440     t[k*2+1] = b[idx+1];
1441   }
1442   for (k=0; k<mbs; k++){
1443     v  = aa + 4*ai[k];
1444     vj = aj + ai[k];
1445     k2 = k*2;
1446     x0 = t[k2]; x1 = t[k2+1];
1447     nz = ai[k+1] - ai[k];
1448     while (nz--) {
1449       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1450       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1451       vj++; v += 4;
1452     }
1453     diag = aa+k*4;  /* ptr to inv(Dk) */
1454     t[k2]   = diag[0]*x0 + diag[2]*x1;
1455     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1456   }
1457 
1458   /* solve U*x = y by back substitution */
1459   for (k=mbs-1; k>=0; k--){
1460     v  = aa + 4*ai[k];
1461     vj = aj + ai[k];
1462     k2 = k*2;
1463     x0 = t[k2]; x1 = t[k2+1];
1464     nz = ai[k+1] - ai[k];
1465     while (nz--) {
1466       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1467       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1468       vj++; v += 4;
1469     }
1470     t[k2]    = x0;
1471     t[k2+1]  = x1;
1472     idx      = 2*r[k];
1473     x[idx]   = x0;
1474     x[idx+1] = x1;
1475   }
1476 
1477   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1478   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1479   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1480   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1481   PetscFunctionReturn(0);
1482 }
1483 
1484 #undef __FUNCT__
1485 #define __FUNCT__ "ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1486 PetscErrorCode ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1487 {
1488   MatScalar      *v,*diag;
1489   PetscScalar    x0,x1;
1490   PetscInt       nz,*vj,k,k2;
1491 
1492   PetscFunctionBegin;
1493   for (k=0; k<mbs; k++){
1494     v  = aa + 4*ai[k];
1495     vj = aj + ai[k];
1496     k2 = k*2;
1497     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1498     nz = ai[k+1] - ai[k];
1499     PetscPrefetchBlock(vj+nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1500     PetscPrefetchBlock(v+4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1501     while (nz--) {
1502       /* x(:) += U(k,:)^T*(Dk*xk) */
1503       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1504       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1505       vj++; v += 4;
1506     }
1507     /* xk = inv(Dk)*(Dk*xk) */
1508     diag = aa+k*4;          /* ptr to inv(Dk) */
1509     x[k2]   = diag[0]*x0 + diag[2]*x1;
1510     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1511   }
1512   PetscFunctionReturn(0);
1513 }
1514 
1515 #undef __FUNCT__
1516 #define __FUNCT__ "BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private"
1517 PetscErrorCode BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(PetscInt *ai,PetscInt *aj,MatScalar *aa,PetscInt mbs,PetscScalar *x)
1518 {
1519   MatScalar      *v;
1520   PetscScalar    x0,x1;
1521   PetscInt       nz,*vj,k,k2;
1522 
1523   PetscFunctionBegin;
1524   for (k=mbs-1; k>=0; k--){
1525     v  = aa + 4*ai[k];
1526     vj = aj + ai[k];
1527     k2 = k*2;
1528     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1529     nz = ai[k+1] - ai[k];
1530     PetscPrefetchBlock(vj-nz,nz,0,PETSC_PREFETCH_HINT_NTA);    /* Indices for the next row (assumes same size as this one) */
1531     PetscPrefetchBlock(v-4*nz,4*nz,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1532     while (nz--) {
1533       /* xk += U(k,:)*x(:) */
1534       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1535       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1536       vj++; v += 4;
1537     }
1538     x[k2]     = x0;
1539     x[k2+1]   = x1;
1540   }
1541   PetscFunctionReturn(0);
1542 }
1543 
1544 #undef __FUNCT__
1545 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1546 PetscErrorCode MatSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1547 {
1548   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1549   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1550   MatScalar      *aa=a->a;
1551   PetscScalar    *x,*b;
1552   PetscErrorCode ierr;
1553 
1554   PetscFunctionBegin;
1555   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1556   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1557 
1558   /* solve U^T * D * y = b by forward substitution */
1559   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1560   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1561 
1562   /* solve U*x = y by back substitution */
1563   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1564 
1565   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1566   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1567   ierr = PetscLogFlops(4.0*bs2*a->nz - (bs+2.0*bs2)*mbs);CHKERRQ(ierr);
1568   PetscFunctionReturn(0);
1569 }
1570 
1571 #undef __FUNCT__
1572 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1573 PetscErrorCode MatForwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1574 {
1575   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1576   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs=A->rmap->bs,bs2=a->bs2;
1577   MatScalar      *aa=a->a;
1578   PetscScalar    *x,*b;
1579   PetscErrorCode ierr;
1580 
1581   PetscFunctionBegin;
1582   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1583   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1584   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1585   ierr = ForwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1586   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1587   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1588   ierr = PetscLogFlops(2.0*bs2*a->nz - bs*mbs);CHKERRQ(ierr);
1589   PetscFunctionReturn(0);
1590 }
1591 
1592 #undef __FUNCT__
1593 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace"
1594 PetscErrorCode MatBackwardSolve_SeqSBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
1595 {
1596   Mat_SeqSBAIJ   *a=(Mat_SeqSBAIJ*)A->data;
1597   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,bs2=a->bs2;
1598   MatScalar      *aa=a->a;
1599   PetscScalar    *x,*b;
1600   PetscErrorCode ierr;
1601 
1602   PetscFunctionBegin;
1603   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1604   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1605   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1606   ierr = BackwardSolve_SeqSBAIJ_2_NaturalOrdering_private(ai,aj,aa,mbs,x);CHKERRQ(ierr);
1607   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1608   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1609   ierr = PetscLogFlops(2.0*bs2*(a->nz - mbs));CHKERRQ(ierr);
1610   PetscFunctionReturn(0);
1611 }
1612 
1613 #undef __FUNCT__
1614 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1615 PetscErrorCode MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1616 {
1617   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1618   IS                isrow=a->row;
1619   PetscErrorCode    ierr;
1620   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1621   const MatScalar   *aa=a->a,*v;
1622   const PetscScalar *b;
1623   PetscScalar       *x,xk,*t;
1624   PetscInt          nz,k,j;
1625 
1626   PetscFunctionBegin;
1627   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1628   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1629   t    = a->solve_work;
1630   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1631 
1632   /* solve U^T*D*y = perm(b) by forward substitution */
1633   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1634   for (k=0; k<mbs; k++){
1635     v  = aa + ai[k];
1636     vj = aj + ai[k];
1637     xk = t[k];
1638     nz = ai[k+1] - ai[k] - 1;
1639     for (j=0; j<nz; j++) t[vj[j]] += v[j]*xk;
1640     t[k] = xk*v[nz];   /* v[nz] = 1/D(k) */
1641   }
1642 
1643   /* solve U*perm(x) = y by back substitution */
1644   for (k=mbs-1; k>=0; k--){
1645     v  = aa + adiag[k] - 1;
1646     vj = aj + adiag[k] - 1;
1647     nz = ai[k+1] - ai[k] - 1;
1648     for (j=0; j<nz; j++) t[k] += v[-j]*t[vj[-j]];
1649     x[rp[k]] = t[k];
1650   }
1651 
1652   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1653   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
1654   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1655   ierr = PetscLogFlops(4.0*a->nz - 3.0*mbs);CHKERRQ(ierr);
1656   PetscFunctionReturn(0);
1657 }
1658 
1659 #undef __FUNCT__
1660 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_inplace"
1661 PetscErrorCode MatSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1662 {
1663   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1664   IS              isrow=a->row;
1665   PetscErrorCode  ierr;
1666   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1667   const MatScalar *aa=a->a,*v;
1668   PetscScalar     *x,*b,xk,*t;
1669   PetscInt        nz,k;
1670 
1671   PetscFunctionBegin;
1672   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1673   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1674   t    = a->solve_work;
1675   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1676 
1677   /* solve U^T*D*y = perm(b) by forward substitution */
1678   for (k=0; k<mbs; k++) t[k] = b[rp[k]];
1679   for (k=0; k<mbs; k++){
1680     v  = aa + ai[k] + 1;
1681     vj = aj + ai[k] + 1;
1682     xk = t[k];
1683     nz = ai[k+1] - ai[k] - 1;
1684     while (nz--) t[*vj++] += (*v++) * xk;
1685     t[k] = xk*aa[ai[k]];  /* aa[k] = 1/D(k) */
1686   }
1687 
1688   /* solve U*perm(x) = y by back substitution */
1689   for (k=mbs-1; k>=0; k--){
1690     v  = aa + ai[k] + 1;
1691     vj = aj + ai[k] + 1;
1692     nz = ai[k+1] - ai[k] - 1;
1693     while (nz--) t[k] += (*v++) * t[*vj++];
1694     x[rp[k]] = t[k];
1695   }
1696 
1697   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1698   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1699   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1700   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
1701   PetscFunctionReturn(0);
1702 }
1703 
1704 #undef __FUNCT__
1705 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1"
1706 PetscErrorCode MatForwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1707 {
1708   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1709   IS              isrow=a->row;
1710   PetscErrorCode  ierr;
1711   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1712   const MatScalar *aa=a->a,*v;
1713   PetscReal       diagk;
1714   PetscScalar     *x,*b,xk;
1715   PetscInt        nz,k;
1716 
1717   PetscFunctionBegin;
1718   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1719   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1720   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1721   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1722 
1723   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1724   for (k=0; k<mbs; k++){
1725     v  = aa + ai[k];
1726     vj = aj + ai[k];
1727     xk = x[k];
1728     nz = ai[k+1] - ai[k] - 1;
1729     while (nz--) x[*vj++] += (*v++) * xk;
1730 
1731     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
1732     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1733     x[k] = xk*sqrt(diagk);
1734   }
1735   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1736   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1737   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1738   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1739   PetscFunctionReturn(0);
1740 }
1741 
1742 #undef __FUNCT__
1743 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_inplace"
1744 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1745 {
1746   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1747   IS              isrow=a->row;
1748   PetscErrorCode  ierr;
1749   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1750   const MatScalar *aa=a->a,*v;
1751   PetscReal       diagk;
1752   PetscScalar     *x,*b,xk;
1753   PetscInt        nz,k;
1754 
1755   PetscFunctionBegin;
1756   /* solve U^T*D^(1/2)*x = perm(b) by forward substitution */
1757   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1758   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1759   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1760 
1761   for (k=0; k<mbs; k++) x[k] = b[rp[k]];
1762   for (k=0; k<mbs; k++){
1763     v  = aa + ai[k] + 1;
1764     vj = aj + ai[k] + 1;
1765     xk = x[k];
1766     nz = ai[k+1] - ai[k] - 1;
1767     while (nz--) x[*vj++] += (*v++) * xk;
1768 
1769     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
1770     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1771     x[k] = xk*sqrt(diagk);
1772   }
1773   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1774   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1775   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1776   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1777   PetscFunctionReturn(0);
1778 }
1779 
1780 #undef __FUNCT__
1781 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1"
1782 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1783 {
1784   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1785   IS              isrow=a->row;
1786   PetscErrorCode  ierr;
1787   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj,*adiag = a->diag;
1788   const MatScalar *aa=a->a,*v;
1789   PetscReal       diagk;
1790   PetscScalar     *x,*b,*t;
1791   PetscInt        nz,k;
1792 
1793   PetscFunctionBegin;
1794   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1795   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1796   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1797   t    = a->solve_work;
1798   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1799 
1800   for (k=mbs-1; k>=0; k--){
1801     v  = aa + ai[k];
1802     vj = aj + ai[k];
1803     diagk = PetscRealPart(aa[adiag[k]]);
1804     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1805     t[k] = b[k] * sqrt(diagk);
1806     nz = ai[k+1] - ai[k] - 1;
1807     while (nz--) t[k] += (*v++) * t[*vj++];
1808     x[rp[k]] = t[k];
1809   }
1810   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1811   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1812   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1813   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1814   PetscFunctionReturn(0);
1815 }
1816 
1817 #undef __FUNCT__
1818 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_inplace"
1819 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
1820 {
1821   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1822   IS              isrow=a->row;
1823   PetscErrorCode  ierr;
1824   const PetscInt  mbs=a->mbs,*ai=a->i,*aj=a->j,*rp,*vj;
1825   const MatScalar *aa=a->a,*v;
1826   PetscReal       diagk;
1827   PetscScalar     *x,*b,*t;
1828   PetscInt        nz,k;
1829 
1830   PetscFunctionBegin;
1831   /* solve D^(1/2)*U*perm(x) = b by back substitution */
1832   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1833   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1834   t    = a->solve_work;
1835   ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1836 
1837   for (k=mbs-1; k>=0; k--){
1838     v  = aa + ai[k] + 1;
1839     vj = aj + ai[k] + 1;
1840     diagk = PetscRealPart(aa[ai[k]]);
1841     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
1842     t[k] = b[k] * sqrt(diagk);
1843     nz = ai[k+1] - ai[k] - 1;
1844     while (nz--) t[k] += (*v++) * t[*vj++];
1845     x[rp[k]] = t[k];
1846   }
1847   ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1848   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1849   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1850   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
1851   PetscFunctionReturn(0);
1852 }
1853 
1854 #undef __FUNCT__
1855 #define __FUNCT__ "MatSolves_SeqSBAIJ_1"
1856 PetscErrorCode MatSolves_SeqSBAIJ_1(Mat A,Vecs bb,Vecs xx)
1857 {
1858   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1859   PetscErrorCode ierr;
1860 
1861   PetscFunctionBegin;
1862   if (A->rmap->bs == 1) {
1863     ierr = MatSolve_SeqSBAIJ_1(A,bb->v,xx->v);CHKERRQ(ierr);
1864   } else {
1865     IS              isrow=a->row;
1866     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1867     const MatScalar *aa=a->a,*v;
1868     PetscScalar     *x,*b,*t;
1869     PetscInt        nz,k,n,i,j;
1870     if (bb->n > a->solves_work_n) {
1871       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1872       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1873       a->solves_work_n = bb->n;
1874     }
1875     n    = bb->n;
1876     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1877     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1878     t    = a->solves_work;
1879 
1880     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1881 
1882     /* solve U^T*D*y = perm(b) by forward substitution */
1883     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1884     for (k=0; k<mbs; k++){
1885       v  = aa + ai[k];
1886       vj = aj + ai[k];
1887       nz = ai[k+1] - ai[k] - 1;
1888       for (j=0; j<nz; j++){
1889         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1890         v++;vj++;
1891       }
1892       for (i=0; i<n; i++) t[n*k+i] *= aa[nz];  /* note: aa[nz] = 1/D(k) */
1893     }
1894 
1895     /* solve U*perm(x) = y by back substitution */
1896     for (k=mbs-1; k>=0; k--){
1897       v  = aa + ai[k] - 1;
1898       vj = aj + ai[k] - 1;
1899       nz = ai[k+1] - ai[k] - 1;
1900       for (j=0; j<nz; j++){
1901         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1902         v++;vj++;
1903       }
1904       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1905     }
1906 
1907     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1908     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1909     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1910     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1911   }
1912   PetscFunctionReturn(0);
1913 }
1914 
1915 #undef __FUNCT__
1916 #define __FUNCT__ "MatSolves_SeqSBAIJ_1_inplace"
1917 PetscErrorCode MatSolves_SeqSBAIJ_1_inplace(Mat A,Vecs bb,Vecs xx)
1918 {
1919   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
1920   PetscErrorCode ierr;
1921 
1922   PetscFunctionBegin;
1923   if (A->rmap->bs == 1) {
1924     ierr = MatSolve_SeqSBAIJ_1_inplace(A,bb->v,xx->v);CHKERRQ(ierr);
1925   } else {
1926     IS              isrow=a->row;
1927     const PetscInt  *vj,mbs=a->mbs,*ai=a->i,*aj=a->j,*rp;
1928     const MatScalar *aa=a->a,*v;
1929     PetscScalar     *x,*b,*t;
1930     PetscInt        nz,k,n,i;
1931     if (bb->n > a->solves_work_n) {
1932       ierr = PetscFree(a->solves_work);CHKERRQ(ierr);
1933       ierr = PetscMalloc(bb->n*A->rmap->N*sizeof(PetscScalar),&a->solves_work);CHKERRQ(ierr);
1934       a->solves_work_n = bb->n;
1935     }
1936     n    = bb->n;
1937     ierr = VecGetArray(bb->v,&b);CHKERRQ(ierr);
1938     ierr = VecGetArray(xx->v,&x);CHKERRQ(ierr);
1939     t    = a->solves_work;
1940 
1941     ierr = ISGetIndices(isrow,&rp);CHKERRQ(ierr);
1942 
1943     /* solve U^T*D*y = perm(b) by forward substitution */
1944     for (k=0; k<mbs; k++) {for (i=0; i<n; i++) t[n*k+i] = b[rp[k]+i*mbs];} /* values are stored interlaced in t */
1945     for (k=0; k<mbs; k++){
1946       v  = aa + ai[k];
1947       vj = aj + ai[k];
1948       nz = ai[k+1] - ai[k];
1949       while (nz--) {
1950         for (i=0; i<n; i++) t[n*(*vj)+i] += (*v) * t[n*k+i];
1951         v++;vj++;
1952       }
1953       for (i=0; i<n; i++) t[n*k+i] *= aa[k];  /* note: aa[k] = 1/D(k) */
1954     }
1955 
1956     /* solve U*perm(x) = y by back substitution */
1957     for (k=mbs-1; k>=0; k--){
1958       v  = aa + ai[k];
1959       vj = aj + ai[k];
1960       nz = ai[k+1] - ai[k];
1961       while (nz--) {
1962         for (i=0; i<n; i++) t[n*k+i] += (*v) * t[n*(*vj)+i];
1963         v++;vj++;
1964       }
1965       for (i=0; i<n; i++) x[rp[k]+i*mbs] = t[n*k+i];
1966     }
1967 
1968     ierr = ISRestoreIndices(isrow,&rp);CHKERRQ(ierr);
1969     ierr = VecRestoreArray(bb->v,&b);CHKERRQ(ierr);
1970     ierr = VecRestoreArray(xx->v,&x);CHKERRQ(ierr);
1971     ierr = PetscLogFlops(bb->n*(4.0*a->nz - 3.0*mbs));CHKERRQ(ierr);
1972   }
1973   PetscFunctionReturn(0);
1974 }
1975 
1976 #undef __FUNCT__
1977 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
1978 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1979 {
1980   Mat_SeqSBAIJ      *a = (Mat_SeqSBAIJ *)A->data;
1981   PetscErrorCode    ierr;
1982   const PetscInt    mbs=a->mbs,*ai=a->i,*aj=a->j,*vj,*adiag = a->diag;
1983   const MatScalar   *aa=a->a,*v;
1984   const PetscScalar *b;
1985   PetscScalar       *x,xi;
1986   PetscInt          nz,i,j;
1987 
1988   PetscFunctionBegin;
1989   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
1990   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1991 
1992   /* solve U^T*D*y = b by forward substitution */
1993   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1994   for (i=0; i<mbs; i++){
1995     v  = aa + ai[i];
1996     vj = aj + ai[i];
1997     xi = x[i];
1998     nz = ai[i+1] - ai[i] - 1; /* exclude diag[i] */
1999     for (j=0; j<nz; j++){
2000       x[vj[j]] += v[j]* xi;
2001     }
2002     x[i] = xi*v[nz];  /* v[nz] = aa[diag[i]] = 1/D(i) */
2003   }
2004 
2005   /* solve U*x = y by backward substitution */
2006   for (i=mbs-2; i>=0; i--){
2007     xi = x[i];
2008     v  = aa + adiag[i] - 1; /* end of row i, excluding diag */
2009     vj = aj + adiag[i] - 1;
2010     nz = ai[i+1] - ai[i] - 1;
2011     for (j=0; j<nz; j++) xi += v[-j]*x[vj[-j]];
2012     x[i] = xi;
2013   }
2014 
2015   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
2016   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2017   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2018   PetscFunctionReturn(0);
2019 }
2020 
2021 #undef __FUNCT__
2022 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2023 PetscErrorCode MatSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2024 {
2025   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2026   PetscErrorCode ierr;
2027   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2028   MatScalar      *aa=a->a,*v;
2029   PetscScalar    *x,*b,xk;
2030   PetscInt       nz,*vj,k;
2031 
2032   PetscFunctionBegin;
2033   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2034   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2035 
2036   /* solve U^T*D*y = b by forward substitution */
2037   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2038   for (k=0; k<mbs; k++){
2039     v  = aa + ai[k] + 1;
2040     vj = aj + ai[k] + 1;
2041     xk = x[k];
2042     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2043     while (nz--) x[*vj++] += (*v++) * xk;
2044     x[k] = xk*aa[ai[k]];  /* note: aa[diag[k]] = 1/D(k) */
2045   }
2046 
2047   /* solve U*x = y by back substitution */
2048   for (k=mbs-2; k>=0; k--){
2049     v  = aa + ai[k] + 1;
2050     vj = aj + ai[k] + 1;
2051     xk = x[k];
2052     nz = ai[k+1] - ai[k] - 1;
2053     while (nz--) {
2054       xk += (*v++) * x[*vj++];
2055     }
2056     x[k] = xk;
2057   }
2058 
2059   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2060   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2061   ierr = PetscLogFlops(4.0*a->nz - 3*mbs);CHKERRQ(ierr);
2062   PetscFunctionReturn(0);
2063 }
2064 
2065 #undef __FUNCT__
2066 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering"
2067 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2068 {
2069   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
2070   PetscErrorCode  ierr;
2071   PetscInt        mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2072   const MatScalar *aa=a->a,*v;
2073   PetscReal       diagk;
2074   PetscScalar     *x,*b;
2075   PetscInt        nz,*vj,k;
2076 
2077   PetscFunctionBegin;
2078   /* solve U^T*D^(1/2)*x = b by forward substitution */
2079   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2080   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2081   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2082   for (k=0; k<mbs; k++){
2083     v  = aa + ai[k];
2084     vj = aj + ai[k];
2085     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2086     while (nz--) x[*vj++] += (*v++) * x[k];
2087     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[adiag[k]] = 1/D(k) */
2088     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[adiag[k]]),PetscImaginaryPart(aa[adiag[k]]));
2089     x[k] *= sqrt(diagk);
2090   }
2091   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2092   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2093   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2094   PetscFunctionReturn(0);
2095 }
2096 
2097 #undef __FUNCT__
2098 #define __FUNCT__ "MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2099 PetscErrorCode MatForwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2100 {
2101   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2102   PetscErrorCode ierr;
2103   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2104   MatScalar      *aa=a->a,*v;
2105   PetscReal      diagk;
2106   PetscScalar    *x,*b;
2107   PetscInt       nz,*vj,k;
2108 
2109   PetscFunctionBegin;
2110   /* solve U^T*D^(1/2)*x = b by forward substitution */
2111   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2113   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
2114   for (k=0; k<mbs; k++){
2115     v  = aa + ai[k] + 1;
2116     vj = aj + ai[k] + 1;
2117     nz = ai[k+1] - ai[k] - 1;     /* exclude diag[k] */
2118     while (nz--) x[*vj++] += (*v++) * x[k];
2119     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2120     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal (%g,%g) must be real and nonnegative",PetscRealPart(aa[ai[k]]),PetscImaginaryPart(aa[ai[k]]));
2121     x[k] *= sqrt(diagk);
2122   }
2123   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2124   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2125   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2126   PetscFunctionReturn(0);
2127 }
2128 
2129 #undef __FUNCT__
2130 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering"
2131 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
2132 {
2133   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2134   PetscErrorCode ierr;
2135   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j,*adiag = a->diag;
2136   MatScalar      *aa=a->a,*v;
2137   PetscReal      diagk;
2138   PetscScalar    *x,*b;
2139   PetscInt       nz,*vj,k;
2140 
2141   PetscFunctionBegin;
2142   /* solve D^(1/2)*U*x = b by back substitution */
2143   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2144   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2145 
2146   for (k=mbs-1; k>=0; k--){
2147     v  = aa + ai[k];
2148     vj = aj + ai[k];
2149     diagk = PetscRealPart(aa[adiag[k]]); /* note: aa[diag[k]] = 1/D(k) */
2150     if (PetscImaginaryPart(aa[adiag[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2151     x[k] = sqrt(diagk)*b[k];
2152     nz = ai[k+1] - ai[k] - 1;
2153     while (nz--) x[k] += (*v++) * x[*vj++];
2154   }
2155   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2156   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2157   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2158   PetscFunctionReturn(0);
2159 }
2160 
2161 #undef __FUNCT__
2162 #define __FUNCT__ "MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace"
2163 PetscErrorCode MatBackwardSolve_SeqSBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2164 {
2165   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ *)A->data;
2166   PetscErrorCode ierr;
2167   PetscInt       mbs=a->mbs,*ai=a->i,*aj=a->j;
2168   MatScalar      *aa=a->a,*v;
2169   PetscReal      diagk;
2170   PetscScalar    *x,*b;
2171   PetscInt       nz,*vj,k;
2172 
2173   PetscFunctionBegin;
2174   /* solve D^(1/2)*U*x = b by back substitution */
2175   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2176   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2177 
2178   for (k=mbs-1; k>=0; k--){
2179     v  = aa + ai[k] + 1;
2180     vj = aj + ai[k] + 1;
2181     diagk = PetscRealPart(aa[ai[k]]); /* note: aa[diag[k]] = 1/D(k) */
2182     if (PetscImaginaryPart(aa[ai[k]]) || diagk < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Diagonal must be real and nonnegative");
2183     x[k] = sqrt(diagk)*b[k];
2184     nz = ai[k+1] - ai[k] - 1;
2185     while (nz--) x[k] += (*v++) * x[*vj++];
2186   }
2187   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2188   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2189   ierr = PetscLogFlops(2.0*a->nz - mbs);CHKERRQ(ierr);
2190   PetscFunctionReturn(0);
2191 }
2192 
2193 /* Use Modified Sparse Row storage for u and ju, see Saad pp.85 */
2194 #undef __FUNCT__
2195 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_MSR"
2196 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_MSR(Mat B,Mat A,IS perm,const MatFactorInfo *info)
2197 {
2198   Mat_SeqSBAIJ   *a = (Mat_SeqSBAIJ*)A->data,*b;
2199   PetscErrorCode ierr;
2200   const PetscInt *rip,mbs = a->mbs,*ai = a->i,*aj = a->j;
2201   PetscInt       *jutmp,bs = A->rmap->bs,bs2=a->bs2,i;
2202   PetscInt       m,reallocs = 0,*levtmp;
2203   PetscInt       *prowl,*q,jmin,jmax,juidx,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
2204   PetscInt       incrlev,*lev,shift,prow,nz;
2205   PetscReal      f = info->fill,levels = info->levels;
2206   PetscBool      perm_identity;
2207 
2208   PetscFunctionBegin;
2209   /* check whether perm is the identity mapping */
2210   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2211 
2212   if (perm_identity){
2213     a->permute = PETSC_FALSE;
2214     ai = a->i; aj = a->j;
2215   } else { /*  non-trivial permutation */
2216     SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2217     a->permute = PETSC_TRUE;
2218     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
2219     ai = a->inew; aj = a->jnew;
2220   }
2221 
2222   /* initialization */
2223   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2224   umax  = (PetscInt)(f*ai[mbs] + 1);
2225   ierr  = PetscMalloc(umax*sizeof(PetscInt),&lev);CHKERRQ(ierr);
2226   umax += mbs + 1;
2227   shift = mbs + 1;
2228   ierr  = PetscMalloc((mbs+1)*sizeof(PetscInt),&iu);CHKERRQ(ierr);
2229   ierr  = PetscMalloc(umax*sizeof(PetscInt),&ju);CHKERRQ(ierr);
2230   iu[0] = mbs + 1;
2231   juidx = mbs + 1;
2232   /* prowl: linked list for pivot row */
2233   ierr    = PetscMalloc3(mbs,PetscInt,&prowl,mbs,PetscInt,&q,mbs,PetscInt,&levtmp);CHKERRQ(ierr);
2234   /* q: linked list for col index */
2235 
2236   for (i=0; i<mbs; i++){
2237     prowl[i] = mbs;
2238     q[i] = 0;
2239   }
2240 
2241   /* for each row k */
2242   for (k=0; k<mbs; k++){
2243     nzk  = 0;
2244     q[k] = mbs;
2245     /* copy current row into linked list */
2246     nz = ai[rip[k]+1] - ai[rip[k]];
2247     j = ai[rip[k]];
2248     while (nz--){
2249       vj = rip[aj[j++]];
2250       if (vj > k){
2251         qm = k;
2252         do {
2253           m  = qm; qm = q[m];
2254         } while(qm < vj);
2255         if (qm == vj) {
2256           SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Duplicate entry in A\n");
2257         }
2258         nzk++;
2259         q[m]       = vj;
2260         q[vj]      = qm;
2261         levtmp[vj] = 0;   /* initialize lev for nonzero element */
2262       }
2263     }
2264 
2265     /* modify nonzero structure of k-th row by computing fill-in
2266        for each row prow to be merged in */
2267     prow = k;
2268     prow = prowl[prow]; /* next pivot row (== 0 for symbolic factorization) */
2269 
2270     while (prow < k){
2271       /* merge row prow into k-th row */
2272       jmin = iu[prow] + 1;
2273       jmax = iu[prow+1];
2274       qm = k;
2275       for (j=jmin; j<jmax; j++){
2276         incrlev = lev[j-shift] + 1;
2277 	if (incrlev > levels) continue;
2278         vj      = ju[j];
2279         do {
2280           m = qm; qm = q[m];
2281         } while (qm < vj);
2282         if (qm != vj){      /* a fill */
2283           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
2284           levtmp[vj] = incrlev;
2285         } else {
2286           if (levtmp[vj] > incrlev) levtmp[vj] = incrlev;
2287         }
2288       }
2289       prow = prowl[prow]; /* next pivot row */
2290     }
2291 
2292     /* add k to row list for first nonzero element in k-th row */
2293     if (nzk > 1){
2294       i = q[k]; /* col value of first nonzero element in k_th row of U */
2295       prowl[k] = prowl[i]; prowl[i] = k;
2296     }
2297     iu[k+1] = iu[k] + nzk;
2298 
2299     /* allocate more space to ju and lev if needed */
2300     if (iu[k+1] > umax) {
2301       /* estimate how much additional space we will need */
2302       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2303       /* just double the memory each time */
2304       maxadd = umax;
2305       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
2306       umax += maxadd;
2307 
2308       /* allocate a longer ju */
2309       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2310       ierr     = PetscMemcpy(jutmp,ju,iu[k]*sizeof(PetscInt));CHKERRQ(ierr);
2311       ierr     = PetscFree(ju);CHKERRQ(ierr);
2312       ju       = jutmp;
2313 
2314       ierr     = PetscMalloc(umax*sizeof(PetscInt),&jutmp);CHKERRQ(ierr);
2315       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(PetscInt));CHKERRQ(ierr);
2316       ierr     = PetscFree(lev);CHKERRQ(ierr);
2317       lev      = jutmp;
2318       reallocs += 2; /* count how many times we realloc */
2319     }
2320 
2321     /* save nonzero structure of k-th row in ju */
2322     i=k;
2323     while (nzk --) {
2324       i                = q[i];
2325       ju[juidx]        = i;
2326       lev[juidx-shift] = levtmp[i];
2327       juidx++;
2328     }
2329   }
2330 
2331 #if defined(PETSC_USE_INFO)
2332   if (ai[mbs] != 0) {
2333     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2334     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
2335     ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2336     ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
2337     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
2338   } else {
2339     ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2340   }
2341 #endif
2342 
2343   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2344   ierr = PetscFree3(prowl,q,levtmp);CHKERRQ(ierr);
2345   ierr = PetscFree(lev);CHKERRQ(ierr);
2346 
2347   /* put together the new matrix */
2348   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(B,bs,0,PETSC_NULL);CHKERRQ(ierr);
2349 
2350   /* ierr = PetscLogObjectParent(B,iperm);CHKERRQ(ierr); */
2351   b    = (Mat_SeqSBAIJ*)(B)->data;
2352   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2353   b->singlemalloc = PETSC_FALSE;
2354   b->free_a       = PETSC_TRUE;
2355   b->free_ij      = PETSC_TRUE;
2356   /* the next line frees the default space generated by the Create() */
2357   ierr    = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
2358   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
2359   b->j    = ju;
2360   b->i    = iu;
2361   b->diag = 0;
2362   b->ilen = 0;
2363   b->imax = 0;
2364 
2365   if (b->row) {
2366     ierr = ISDestroy(b->row);CHKERRQ(ierr);
2367   }
2368   if (b->icol) {
2369     ierr = ISDestroy(b->icol);CHKERRQ(ierr);
2370   }
2371   b->row  = perm;
2372   b->icol = perm;
2373   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2374   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2375   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2376   /* In b structure:  Free imax, ilen, old a, old j.
2377      Allocate idnew, solve_work, new a, new j */
2378   ierr    = PetscLogObjectMemory(B,(iu[mbs]-mbs)*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2379   b->maxnz = b->nz = iu[mbs];
2380 
2381   (B)->info.factor_mallocs    = reallocs;
2382   (B)->info.fill_ratio_given  = f;
2383   if (ai[mbs] != 0) {
2384     (B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
2385   } else {
2386     (B)->info.fill_ratio_needed = 0.0;
2387   }
2388   ierr = MatSeqSBAIJSetNumericFactorization_inplace(B,perm_identity);CHKERRQ(ierr);
2389   PetscFunctionReturn(0);
2390 }
2391 
2392 /*
2393   See MatICCFactorSymbolic_SeqAIJ() for description of its data structure
2394 */
2395 #include "petscbt.h"
2396 #include "../src/mat/utils/freespace.h"
2397 #undef __FUNCT__
2398 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
2399 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2400 {
2401   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data,*b;
2402   PetscErrorCode     ierr;
2403   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2404   PetscInt           bs=A->rmap->bs,am=a->mbs,d,*ai=a->i,*aj= a->j;
2405   const PetscInt     *rip;
2406   PetscInt           reallocs=0,i,*ui,*udiag,*cols;
2407   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2408   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*uj,**uj_ptr,**uj_lvl_ptr;
2409   PetscReal          fill=info->fill,levels=info->levels;
2410   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2411   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2412   PetscBT            lnkbt;
2413 
2414   PetscFunctionBegin;
2415   if (bs > 1){
2416     ierr = MatICCFactorSymbolic_SeqSBAIJ_inplace(fact,A,perm,info);CHKERRQ(ierr);
2417     PetscFunctionReturn(0);
2418   }
2419   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
2420   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2421   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2422 
2423   /* check whether perm is the identity mapping */
2424   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2425   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2426   a->permute = PETSC_FALSE;
2427 
2428   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2429   ierr = PetscMalloc((am+1)*sizeof(PetscInt),&udiag);CHKERRQ(ierr);
2430   ui[0] = 0;
2431 
2432   /* ICC(0) without matrix ordering: simply rearrange column indices */
2433   if (!levels){
2434     /* reuse the column pointers and row offsets for memory savings */
2435     for (i=0; i<am; i++) {
2436       ncols    = ai[i+1] - ai[i];
2437       ui[i+1]  = ui[i] + ncols;
2438       udiag[i] = ui[i+1] - 1; /* points to the last entry of U(i,:) */
2439     }
2440     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2441     cols = uj;
2442     for (i=0; i<am; i++) {
2443       aj    = a->j + ai[i] + 1; /* 1st entry of U(i,:) without diagonal */
2444       ncols = ai[i+1] - ai[i] -1;
2445       for (j=0; j<ncols; j++) *cols++ = aj[j];
2446       *cols++ = i; /* diagoanl is located as the last entry of U(i,:) */
2447     }
2448   } else { /* case: levels>0 */
2449     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2450 
2451     /* initialization */
2452     /* jl: linked list for storing indices of the pivot rows
2453        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2454     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2455     for (i=0; i<am; i++){
2456       jl[i] = am; il[i] = 0;
2457     }
2458 
2459     /* create and initialize a linked list for storing column indices of the active row k */
2460     nlnk = am + 1;
2461     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2462 
2463     /* initial FreeSpace size is fill*(ai[am]+1) */
2464     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2465     current_space = free_space;
2466     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2467     current_space_lvl = free_space_lvl;
2468 
2469     for (k=0; k<am; k++){  /* for each active row k */
2470       /* initialize lnk by the column indices of row k */
2471       nzk   = 0;
2472       ncols = ai[k+1] - ai[k];
2473       if (!ncols) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_CH_ZRPVT,"Empty row %D in matrix ",k);
2474       cols  = aj+ai[k];
2475       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2476       nzk += nlnk;
2477 
2478       /* update lnk by computing fill-in for each pivot row to be merged in */
2479       prow = jl[k]; /* 1st pivot row */
2480 
2481       while (prow < k){
2482         nextprow = jl[prow];
2483 
2484         /* merge prow into k-th row */
2485         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2486         jmax = ui[prow+1];
2487         ncols = jmax-jmin;
2488         i     = jmin - ui[prow];
2489 
2490         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2491         uj    = uj_lvl_ptr[prow] + i; /* levels of cols */
2492         j     = *(uj - 1);
2493         ierr = PetscICCLLAddSorted(ncols,cols,levels,uj,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2494         nzk += nlnk;
2495 
2496         /* update il and jl for prow */
2497         if (jmin < jmax){
2498           il[prow] = jmin;
2499           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2500         }
2501         prow = nextprow;
2502       }
2503 
2504       /* if free space is not available, make more free space */
2505       if (current_space->local_remaining<nzk) {
2506         i = am - k + 1; /* num of unfactored rows */
2507         i *= PetscMin(nzk, i-1); /* i*nzk, i*(i-1): estimated and max additional space needed */
2508         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2509         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2510         reallocs++;
2511       }
2512 
2513       /* copy data into free_space and free_space_lvl, then initialize lnk */
2514       if (nzk == 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Empty row %D in ICC matrix factor",k);
2515       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2516 
2517       /* add the k-th row into il and jl */
2518       if (nzk > 1){
2519         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2520         jl[k] = jl[i]; jl[i] = k;
2521         il[k] = ui[k] + 1;
2522       }
2523       uj_ptr[k]     = current_space->array;
2524       uj_lvl_ptr[k] = current_space_lvl->array;
2525 
2526       current_space->array           += nzk;
2527       current_space->local_used      += nzk;
2528       current_space->local_remaining -= nzk;
2529       current_space_lvl->array           += nzk;
2530       current_space_lvl->local_used      += nzk;
2531       current_space_lvl->local_remaining -= nzk;
2532 
2533       ui[k+1] = ui[k] + nzk;
2534     }
2535 
2536     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2537     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2538 
2539     /* destroy list of free space and other temporary array(s) */
2540     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2541     ierr = PetscFreeSpaceContiguous_Cholesky(&free_space,uj,am,ui,udiag);CHKERRQ(ierr); /* store matrix factor  */
2542     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2543     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2544 
2545   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2546 
2547   /* put together the new matrix in MATSEQSBAIJ format */
2548   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2549 
2550   b = (Mat_SeqSBAIJ*)(fact)->data;
2551   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2552   b->singlemalloc = PETSC_FALSE;
2553   b->free_a  = PETSC_TRUE;
2554   b->free_ij = free_ij;
2555   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2556   b->j     = uj;
2557   b->i     = ui;
2558   b->diag  = udiag;
2559   b->free_diag = PETSC_TRUE;
2560   b->ilen  = 0;
2561   b->imax  = 0;
2562   b->row   = perm;
2563   b->col   = perm;
2564   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2565   ierr     = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2566   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2567   ierr = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2568   ierr = PetscLogObjectMemory(fact,ui[am]*(sizeof(PetscInt)+sizeof(MatScalar)));CHKERRQ(ierr);
2569   b->maxnz = b->nz = ui[am];
2570 
2571   fact->info.factor_mallocs    = reallocs;
2572   fact->info.fill_ratio_given  = fill;
2573   if (ai[am] != 0) {
2574     fact->info.fill_ratio_needed = ((PetscReal)ui[am])/ai[am];
2575   } else {
2576     fact->info.fill_ratio_needed = 0.0;
2577   }
2578 #if defined(PETSC_USE_INFO)
2579     if (ai[am] != 0) {
2580       PetscReal af = fact->info.fill_ratio_needed;
2581       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2582       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2583       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2584     } else {
2585       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2586     }
2587 #endif
2588   fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
2589   PetscFunctionReturn(0);
2590 }
2591 
2592 #undef __FUNCT__
2593 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ_inplace"
2594 PetscErrorCode MatICCFactorSymbolic_SeqSBAIJ_inplace(Mat fact,Mat A,IS perm,const MatFactorInfo *info)
2595 {
2596   Mat_SeqSBAIJ       *a = (Mat_SeqSBAIJ*)A->data;
2597   Mat_SeqSBAIJ       *b;
2598   PetscErrorCode     ierr;
2599   PetscBool          perm_identity,free_ij = PETSC_TRUE,missing;
2600   PetscInt           bs=A->rmap->bs,am=a->mbs,d;
2601   const PetscInt     *cols,*rip,*ai=a->i,*aj=a->j;
2602   PetscInt           reallocs=0,i,*ui;
2603   PetscInt           jmin,jmax,nzk,k,j,*jl,prow,*il,nextprow;
2604   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL,ncols,*cols_lvl,*uj,**uj_ptr,**uj_lvl_ptr;
2605   PetscReal          fill=info->fill,levels=info->levels,ratio_needed;
2606   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
2607   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
2608   PetscBT            lnkbt;
2609 
2610   PetscFunctionBegin;
2611   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
2612   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
2613 
2614   /*
2615    This code originally uses Modified Sparse Row (MSR) storage
2616    (see page 85, "Iterative Methods ..." by Saad) for the output matrix B - bad choice!
2617    Then it is rewritten so the factor B takes seqsbaij format. However the associated
2618    MatCholeskyFactorNumeric_() have not been modified for the cases of bs>1,
2619    thus the original code in MSR format is still used for these cases.
2620    The code below should replace MatICCFactorSymbolic_SeqSBAIJ_MSR() whenever
2621    MatCholeskyFactorNumeric_() is modified for using sbaij symbolic factor.
2622   */
2623   if (bs > 1){
2624     ierr = MatICCFactorSymbolic_SeqSBAIJ_MSR(fact,A,perm,info);CHKERRQ(ierr);
2625     PetscFunctionReturn(0);
2626   }
2627 
2628   /* check whether perm is the identity mapping */
2629   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
2630   if (!perm_identity) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Matrix reordering is not supported for sbaij matrix. Use aij format");
2631   a->permute = PETSC_FALSE;
2632 
2633   /* special case that simply copies fill pattern */
2634   if (!levels ) {
2635     /* reuse the column pointers and row offsets for memory savings */
2636     ui           = a->i;
2637     uj           = a->j;
2638     free_ij      = PETSC_FALSE;
2639     ratio_needed = 1.0;
2640   } else { /* case: levels>0 */
2641     ierr = ISGetIndices(perm,&rip);CHKERRQ(ierr);
2642 
2643     /* initialization */
2644     ierr  = PetscMalloc((am+1)*sizeof(PetscInt),&ui);CHKERRQ(ierr);
2645     ui[0] = 0;
2646 
2647     /* jl: linked list for storing indices of the pivot rows
2648        il: il[i] points to the 1st nonzero entry of U(i,k:am-1) */
2649     ierr = PetscMalloc4(am,PetscInt*,&uj_ptr,am,PetscInt*,&uj_lvl_ptr,am,PetscInt,&il,am,PetscInt,&jl);CHKERRQ(ierr);
2650     for (i=0; i<am; i++){
2651       jl[i] = am; il[i] = 0;
2652     }
2653 
2654     /* create and initialize a linked list for storing column indices of the active row k */
2655     nlnk = am + 1;
2656     ierr = PetscIncompleteLLCreate(am,am,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2657 
2658     /* initial FreeSpace size is fill*(ai[am]+1) */
2659     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space);CHKERRQ(ierr);
2660     current_space = free_space;
2661     ierr = PetscFreeSpaceGet((PetscInt)(fill*(ai[am]+1)),&free_space_lvl);CHKERRQ(ierr);
2662     current_space_lvl = free_space_lvl;
2663 
2664     for (k=0; k<am; k++){  /* for each active row k */
2665       /* initialize lnk by the column indices of row rip[k] */
2666       nzk   = 0;
2667       ncols = ai[rip[k]+1] - ai[rip[k]];
2668       cols  = aj+ai[rip[k]];
2669       ierr = PetscIncompleteLLInit(ncols,cols,am,rip,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
2670       nzk += nlnk;
2671 
2672       /* update lnk by computing fill-in for each pivot row to be merged in */
2673       prow = jl[k]; /* 1st pivot row */
2674 
2675       while (prow < k){
2676         nextprow = jl[prow];
2677 
2678         /* merge prow into k-th row */
2679         jmin = il[prow] + 1;  /* index of the 2nd nzero entry in U(prow,k:am-1) */
2680         jmax = ui[prow+1];
2681         ncols = jmax-jmin;
2682         i     = jmin - ui[prow];
2683         cols  = uj_ptr[prow] + i; /* points to the 2nd nzero entry in U(prow,k:am-1) */
2684         j     = *(uj_lvl_ptr[prow] + i - 1);
2685         cols_lvl = uj_lvl_ptr[prow]+i;
2686         ierr = PetscICCLLAddSorted(ncols,cols,levels,cols_lvl,am,nlnk,lnk,lnk_lvl,lnkbt,j);CHKERRQ(ierr);
2687         nzk += nlnk;
2688 
2689         /* update il and jl for prow */
2690         if (jmin < jmax){
2691           il[prow] = jmin;
2692           j = *cols; jl[prow] = jl[j]; jl[j] = prow;
2693         }
2694         prow = nextprow;
2695       }
2696 
2697       /* if free space is not available, make more free space */
2698       if (current_space->local_remaining<nzk) {
2699         i = am - k + 1; /* num of unfactored rows */
2700         i = PetscMin(i*nzk, i*(i-1)); /* i*nzk, i*(i-1): estimated and max additional space needed */
2701         ierr = PetscFreeSpaceGet(i,&current_space);CHKERRQ(ierr);
2702         ierr = PetscFreeSpaceGet(i,&current_space_lvl);CHKERRQ(ierr);
2703         reallocs++;
2704       }
2705 
2706       /* copy data into free_space and free_space_lvl, then initialize lnk */
2707       ierr = PetscIncompleteLLClean(am,am,nzk,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
2708 
2709       /* add the k-th row into il and jl */
2710       if (nzk-1 > 0){
2711         i = current_space->array[1]; /* col value of the first nonzero element in U(k, k+1:am-1) */
2712         jl[k] = jl[i]; jl[i] = k;
2713         il[k] = ui[k] + 1;
2714       }
2715       uj_ptr[k]     = current_space->array;
2716       uj_lvl_ptr[k] = current_space_lvl->array;
2717 
2718       current_space->array           += nzk;
2719       current_space->local_used      += nzk;
2720       current_space->local_remaining -= nzk;
2721       current_space_lvl->array           += nzk;
2722       current_space_lvl->local_used      += nzk;
2723       current_space_lvl->local_remaining -= nzk;
2724 
2725       ui[k+1] = ui[k] + nzk;
2726     }
2727 
2728     ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
2729     ierr = PetscFree4(uj_ptr,uj_lvl_ptr,il,jl);CHKERRQ(ierr);
2730 
2731     /* destroy list of free space and other temporary array(s) */
2732     ierr = PetscMalloc((ui[am]+1)*sizeof(PetscInt),&uj);CHKERRQ(ierr);
2733     ierr = PetscFreeSpaceContiguous(&free_space,uj);CHKERRQ(ierr);
2734     ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
2735     ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
2736     if (ai[am] != 0) {
2737       ratio_needed = ((PetscReal)ui[am])/((PetscReal)ai[am]);
2738     } else {
2739       ratio_needed = 0.0;
2740     }
2741   } /* end of case: levels>0 || (levels=0 && !perm_identity) */
2742 
2743   /* put together the new matrix in MATSEQSBAIJ format */
2744   ierr = MatSeqSBAIJSetPreallocation_SeqSBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
2745 
2746   b = (Mat_SeqSBAIJ*)(fact)->data;
2747   ierr = PetscFree2(b->imax,b->ilen);CHKERRQ(ierr);
2748   b->singlemalloc = PETSC_FALSE;
2749   b->free_a  = PETSC_TRUE;
2750   b->free_ij = free_ij;
2751   ierr = PetscMalloc((ui[am]+1)*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
2752   b->j    = uj;
2753   b->i    = ui;
2754   b->diag    = 0;
2755   b->ilen    = 0;
2756   b->imax    = 0;
2757   b->row     = perm;
2758   b->pivotinblocks = PETSC_FALSE; /* need to get from MatFactorInfo */
2759   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2760   b->icol = perm;
2761   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
2762   ierr    = PetscMalloc((am+1)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2763   b->maxnz = b->nz = ui[am];
2764 
2765   fact->info.factor_mallocs    = reallocs;
2766   fact->info.fill_ratio_given  = fill;
2767   fact->info.fill_ratio_needed = ratio_needed;
2768 #if defined(PETSC_USE_INFO)
2769     if (ai[am] != 0) {
2770       PetscReal af = fact->info.fill_ratio_needed;
2771       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,fill,af);CHKERRQ(ierr);
2772       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
2773       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G) for best performance.\n",af);CHKERRQ(ierr);
2774     } else {
2775       ierr = PetscInfo(A,"Empty matrix.\n");CHKERRQ(ierr);
2776     }
2777 #endif
2778   if (perm_identity){
2779     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering_inplace;
2780   } else {
2781     fact->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_inplace;
2782   }
2783   PetscFunctionReturn(0);
2784 }
2785 
2786