xref: /petsc/src/mat/impls/sbaij/seq/sbaijfact2.c (revision 8e1adb73be8dcf1f76cfdcbcf4221d109e3076ce)
1 /*$Id: sbaijfact2.c,v 1.2.1.41 2001/08/07 03:03:01 balay Exp $*/
2 /*
3     Factorization code for SBAIJ format.
4 */
5 
6 #include "src/mat/impls/sbaij/seq/sbaij.h"
7 #include "src/mat/impls/baij/seq/baij.h"
8 #include "src/vec/vecimpl.h"
9 #include "src/inline/ilu.h"
10 #include "src/inline/dot.h"
11 
12 #undef __FUNCT__
13 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering"
14 int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
15 {
16   PetscFunctionBegin;
17   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
18 }
19 
20 #undef __FUNCT__
21 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering"
22 int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
23 {
24   PetscFunctionBegin;
25   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
26 }
27 
28 #undef __FUNCT__
29 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering"
30 int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
31 {
32   PetscFunctionBegin;
33   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
34 }
35 
36 #undef __FUNCT__
37 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering"
38 int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
39 {
40   PetscFunctionBegin;
41   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
42 }
43 
44 #undef __FUNCT__
45 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering"
46 int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
47 {
48   PetscFunctionBegin;
49   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
50 }
51 
52 #undef __FUNCT__
53 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering"
54 int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
55 {
56   PetscFunctionBegin;
57   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
58 }
59 
60 #undef __FUNCT__
61 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering"
62 int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
63 {
64   PetscFunctionBegin;
65   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
66 }
67 
68 #undef __FUNCT__
69 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_1"
70 int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
71 {
72   PetscFunctionBegin;
73   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
74 }
75 
76 #undef __FUNCT__
77 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_2"
78 int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
79 {
80   PetscFunctionBegin;
81   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
82 }
83 
84 #undef __FUNCT__
85 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_3"
86 int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
87 {
88   PetscFunctionBegin;
89   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
90 }
91 
92 #undef __FUNCT__
93 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_4"
94 int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
95 {
96   PetscFunctionBegin;
97   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
98 }
99 
100 #undef __FUNCT__
101 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_5"
102 int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
103 {
104   PetscFunctionBegin;
105   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
106 }
107 
108 #undef __FUNCT__
109 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_6"
110 int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
111 {
112   PetscFunctionBegin;
113   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
114 }
115 
116 #undef __FUNCT__
117 #define __FUNCT__ "MatSolveTranspose_SeqSBAIJ_7"
118 int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
119 {
120   PetscFunctionBegin;
121   SETERRQ(1,"Function not needed for SBAIJ format. Call MatSolve().");
122 }
123 
124 #undef __FUNCT__
125 #define __FUNCT__ "MatSolve_SeqSBAIJ_N"
126 int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx)
127 {
128   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
129   IS              isrow=a->row;
130   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
131   int             nz,*vj,k,ierr,*r,idx,k1;
132   int             bs=a->bs,bs2 = a->bs2;
133   MatScalar       *aa=a->a,*v,*diag;
134   PetscScalar     *x,*xk,*xj,*b,*xk_tmp,*t;
135 
136   PetscFunctionBegin;
137   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
138   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
139   t  = a->solve_work;
140   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
141   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
142 
143   /* solve U^T * D * y = b by forward substitution */
144   xk = t;
145   for (k=0; k<mbs; k++) { /* t <- perm(b) */
146     idx   = bs*r[k];
147     for (k1=0; k1<bs; k1++) *xk++ = b[idx+k1];
148   }
149   for (k=0; k<mbs; k++){
150     v  = aa + bs2*ai[k];
151     xk = t + k*bs;      /* Dk*xk = k-th block of x */
152     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
153     nz = ai[k+1] - ai[k];
154     vj = aj + ai[k];
155     xj = t + (*vj)*bs;  /* *vj-th block of x, *vj>k */
156     while (nz--) {
157       /* x(:) += U(k,:)^T*(Dk*xk) */
158       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
159       vj++; xj = t + (*vj)*bs;
160       v += bs2;
161     }
162     /* xk = inv(Dk)*(Dk*xk) */
163     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
164     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
165   }
166 
167   /* solve U*x = y by back substitution */
168   for (k=mbs-1; k>=0; k--){
169     v  = aa + bs2*ai[k];
170     xk = t + k*bs;        /* xk */
171     nz = ai[k+1] - ai[k];
172     vj = aj + ai[k];
173     xj = t + (*vj)*bs;
174     while (nz--) {
175       /* xk += U(k,:)*x(:) */
176       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
177       vj++;
178       v += bs2; xj = t + (*vj)*bs;
179     }
180     idx   = bs*r[k];
181     for (k1=0; k1<bs; k1++) x[idx+k1] = *xk++;
182   }
183 
184   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
185   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
186   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
187   PetscLogFlops(bs2*(2*a->s_nz + mbs));
188   PetscFunctionReturn(0);
189 }
190 
191 #undef __FUNCT__
192 #define __FUNCT__ "MatSolve_SeqSBAIJ_N_NaturalOrdering"
193 int MatSolve_SeqSBAIJ_N_NaturalOrdering(Mat A,Vec bb,Vec xx)
194 {
195   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
196   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
197   int             nz,*vj,k,ierr;
198   int             bs=a->bs,bs2 = a->bs2;
199   MatScalar       *aa=a->a,*v,*diag;
200   PetscScalar     *x,*xk,*xj,*b,*xk_tmp;
201 
202   PetscFunctionBegin;
203 
204   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
205   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
206 
207   ierr = PetscMalloc(bs*sizeof(PetscScalar),&xk_tmp);CHKERRQ(ierr);
208 
209   /* solve U^T * D * y = b by forward substitution */
210   ierr = PetscMemcpy(x,b,bs*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
211   for (k=0; k<mbs; k++){
212     v  = aa + bs2*ai[k];
213     xk = x + k*bs;      /* Dk*xk = k-th block of x */
214     ierr = PetscMemcpy(xk_tmp,xk,bs*sizeof(PetscScalar));CHKERRQ(ierr); /* xk_tmp <- xk */
215     nz = ai[k+1] - ai[k];
216     vj = aj + ai[k];
217     xj = x + (*vj)*bs;  /* *vj-th block of x, *vj>k */
218     while (nz--) {
219       /* x(:) += U(k,:)^T*(Dk*xk) */
220       Kernel_v_gets_v_plus_Atranspose_times_w(bs,xj,v,xk_tmp); /* xj <- xj + v^t * xk */
221       vj++; xj = x + (*vj)*bs;
222       v += bs2;
223     }
224     /* xk = inv(Dk)*(Dk*xk) */
225     diag = aa+k*bs2;                            /* ptr to inv(Dk) */
226     Kernel_w_gets_A_times_v(bs,xk_tmp,diag,xk); /* xk <- diag * xk */
227   }
228 
229   /* solve U*x = y by back substitution */
230   for (k=mbs-1; k>=0; k--){
231     v  = aa + bs2*ai[k];
232     xk = x + k*bs;        /* xk */
233     nz = ai[k+1] - ai[k];
234     vj = aj + ai[k];
235     xj = x + (*vj)*bs;
236     while (nz--) {
237       /* xk += U(k,:)*x(:) */
238       Kernel_v_gets_v_plus_A_times_w(bs,xk,v,xj); /* xk <- xk + v*xj */
239       vj++;
240       v += bs2; xj = x + (*vj)*bs;
241     }
242   }
243 
244   ierr = PetscFree(xk_tmp);CHKERRQ(ierr);
245   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
246   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
247   PetscLogFlops(bs2*(2*a->s_nz + mbs));
248   PetscFunctionReturn(0);
249 }
250 
251 #undef __FUNCT__
252 #define __FUNCT__ "MatSolve_SeqSBAIJ_7"
253 int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx)
254 {
255   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
256   IS              isrow=a->row;
257   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
258   int             nz,*vj,k,ierr,*r,idx;
259   MatScalar       *aa=a->a,*v,*d;
260   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,x6,*t,*tp;
261 
262   PetscFunctionBegin;
263   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265   t  = a->solve_work;
266   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
267 
268   /* solve U^T * D * y = b by forward substitution */
269   tp = t;
270   for (k=0; k<mbs; k++) { /* t <- perm(b) */
271     idx   = 7*r[k];
272     tp[0] = b[idx];
273     tp[1] = b[idx+1];
274     tp[2] = b[idx+2];
275     tp[3] = b[idx+3];
276     tp[4] = b[idx+4];
277     tp[5] = b[idx+5];
278     tp[6] = b[idx+6];
279     tp += 7;
280   }
281 
282   for (k=0; k<mbs; k++){
283     v  = aa + 49*ai[k];
284     vj = aj + ai[k];
285     tp = t + k*7;
286     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5]; x6=tp[6];
287     nz = ai[k+1] - ai[k];
288     tp = t + (*vj)*7;
289     while (nz--) {
290       tp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
291       tp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
292       tp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
293       tp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
294       tp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
295       tp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
296       tp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
297       vj++; tp = t + (*vj)*7;
298       v += 49;
299     }
300 
301     /* xk = inv(Dk)*(Dk*xk) */
302     d  = aa+k*49;          /* ptr to inv(Dk) */
303     tp    = t + k*7;
304     tp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
305     tp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
306     tp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
307     tp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
308     tp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
309     tp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
310     tp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
311   }
312 
313   /* solve U*x = y by back substitution */
314   for (k=mbs-1; k>=0; k--){
315     v  = aa + 49*ai[k];
316     vj = aj + ai[k];
317     tp    = t + k*7;
318     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  x6=tp[6]; /* xk */
319     nz = ai[k+1] - ai[k];
320 
321     tp = t + (*vj)*7;
322     while (nz--) {
323       /* xk += U(k,:)*x(:) */
324       x0 += v[0]*tp[0] + v[7]*tp[1] + v[14]*tp[2] + v[21]*tp[3] + v[28]*tp[4] + v[35]*tp[5] + v[42]*tp[6];
325       x1 += v[1]*tp[0] + v[8]*tp[1] + v[15]*tp[2] + v[22]*tp[3] + v[29]*tp[4] + v[36]*tp[5] + v[43]*tp[6];
326       x2 += v[2]*tp[0] + v[9]*tp[1] + v[16]*tp[2] + v[23]*tp[3] + v[30]*tp[4] + v[37]*tp[5] + v[44]*tp[6];
327       x3 += v[3]*tp[0]+ v[10]*tp[1] + v[17]*tp[2] + v[24]*tp[3] + v[31]*tp[4] + v[38]*tp[5] + v[45]*tp[6];
328       x4 += v[4]*tp[0]+ v[11]*tp[1] + v[18]*tp[2] + v[25]*tp[3] + v[32]*tp[4] + v[39]*tp[5] + v[46]*tp[6];
329       x5 += v[5]*tp[0]+ v[12]*tp[1] + v[19]*tp[2] + v[26]*tp[3] + v[33]*tp[4] + v[40]*tp[5] + v[47]*tp[6];
330       x6 += v[6]*tp[0]+ v[13]*tp[1] + v[20]*tp[2] + v[27]*tp[3] + v[34]*tp[4] + v[41]*tp[5] + v[48]*tp[6];
331       vj++; tp = t + (*vj)*7;
332       v += 49;
333     }
334     tp    = t + k*7;
335     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5; tp[6]=x6;
336     idx   = 7*r[k];
337     x[idx]     = x0;
338     x[idx+1]   = x1;
339     x[idx+2]   = x2;
340     x[idx+3]   = x3;
341     x[idx+4]   = x4;
342     x[idx+5]   = x5;
343     x[idx+6]   = x6;
344   }
345 
346   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
347   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
348   PetscLogFlops(49*(2*a->s_nz + mbs));
349   PetscFunctionReturn(0);
350 }
351 
352 #undef __FUNCT__
353 #define __FUNCT__ "MatSolve_SeqSBAIJ_7_NaturalOrdering"
354 int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
355 {
356   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
357   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
358   MatScalar       *aa=a->a,*v,*d;
359   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5,x6;
360   int             nz,*vj,k,ierr;
361 
362   PetscFunctionBegin;
363 
364   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
365   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
366 
367   /* solve U^T * D * y = b by forward substitution */
368   ierr = PetscMemcpy(x,b,7*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
369   for (k=0; k<mbs; k++){
370     v  = aa + 49*ai[k];
371     xp = x + k*7;
372     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* Dk*xk = k-th block of x */
373     nz = ai[k+1] - ai[k];
374     vj = aj + ai[k];
375     xp = x + (*vj)*7;
376     while (nz--) {
377       /* x(:) += U(k,:)^T*(Dk*xk) */
378       xp[0]+=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5 + v[6]*x6;
379       xp[1]+=  v[7]*x0 +  v[8]*x1 +  v[9]*x2+ v[10]*x3+ v[11]*x4+ v[12]*x5+ v[13]*x6;
380       xp[2]+= v[14]*x0 + v[15]*x1 + v[16]*x2+ v[17]*x3+ v[18]*x4+ v[19]*x5+ v[20]*x6;
381       xp[3]+= v[21]*x0 + v[22]*x1 + v[23]*x2+ v[24]*x3+ v[25]*x4+ v[26]*x5+ v[27]*x6;
382       xp[4]+= v[28]*x0 + v[29]*x1 + v[30]*x2+ v[31]*x3+ v[32]*x4+ v[33]*x5+ v[34]*x6;
383       xp[5]+= v[35]*x0 + v[36]*x1 + v[37]*x2+ v[38]*x3+ v[39]*x4+ v[40]*x5+ v[41]*x6;
384       xp[6]+= v[42]*x0 + v[43]*x1 + v[44]*x2+ v[45]*x3+ v[46]*x4+ v[47]*x5+ v[48]*x6;
385       vj++; xp = x + (*vj)*7;
386       v += 49;
387     }
388     /* xk = inv(Dk)*(Dk*xk) */
389     d  = aa+k*49;          /* ptr to inv(Dk) */
390     xp = x + k*7;
391     xp[0] = d[0]*x0 + d[7]*x1 + d[14]*x2 + d[21]*x3 + d[28]*x4 + d[35]*x5 + d[42]*x6;
392     xp[1] = d[1]*x0 + d[8]*x1 + d[15]*x2 + d[22]*x3 + d[29]*x4 + d[36]*x5 + d[43]*x6;
393     xp[2] = d[2]*x0 + d[9]*x1 + d[16]*x2 + d[23]*x3 + d[30]*x4 + d[37]*x5 + d[44]*x6;
394     xp[3] = d[3]*x0+ d[10]*x1 + d[17]*x2 + d[24]*x3 + d[31]*x4 + d[38]*x5 + d[45]*x6;
395     xp[4] = d[4]*x0+ d[11]*x1 + d[18]*x2 + d[25]*x3 + d[32]*x4 + d[39]*x5 + d[46]*x6;
396     xp[5] = d[5]*x0+ d[12]*x1 + d[19]*x2 + d[26]*x3 + d[33]*x4 + d[40]*x5 + d[47]*x6;
397     xp[6] = d[6]*x0+ d[13]*x1 + d[20]*x2 + d[27]*x3 + d[34]*x4 + d[41]*x5 + d[48]*x6;
398   }
399 
400   /* solve U*x = y by back substitution */
401   for (k=mbs-1; k>=0; k--){
402     v  = aa + 49*ai[k];
403     xp = x + k*7;
404     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; x6=xp[6]; /* xk */
405     nz = ai[k+1] - ai[k];
406     vj = aj + ai[k];
407     xp = x + (*vj)*7;
408     while (nz--) {
409       /* xk += U(k,:)*x(:) */
410       x0 += v[0]*xp[0] + v[7]*xp[1] + v[14]*xp[2] + v[21]*xp[3] + v[28]*xp[4] + v[35]*xp[5] + v[42]*xp[6];
411       x1 += v[1]*xp[0] + v[8]*xp[1] + v[15]*xp[2] + v[22]*xp[3] + v[29]*xp[4] + v[36]*xp[5] + v[43]*xp[6];
412       x2 += v[2]*xp[0] + v[9]*xp[1] + v[16]*xp[2] + v[23]*xp[3] + v[30]*xp[4] + v[37]*xp[5] + v[44]*xp[6];
413       x3 += v[3]*xp[0]+ v[10]*xp[1] + v[17]*xp[2] + v[24]*xp[3] + v[31]*xp[4] + v[38]*xp[5] + v[45]*xp[6];
414       x4 += v[4]*xp[0]+ v[11]*xp[1] + v[18]*xp[2] + v[25]*xp[3] + v[32]*xp[4] + v[39]*xp[5] + v[46]*xp[6];
415       x5 += v[5]*xp[0]+ v[12]*xp[1] + v[19]*xp[2] + v[26]*xp[3] + v[33]*xp[4] + v[40]*xp[5] + v[47]*xp[6];
416       x6 += v[6]*xp[0]+ v[13]*xp[1] + v[20]*xp[2] + v[27]*xp[3] + v[34]*xp[4] + v[41]*xp[5] + v[48]*xp[6];
417       vj++;
418       v += 49; xp = x + (*vj)*7;
419     }
420     xp = x + k*7;
421     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5; xp[6]=x6;
422   }
423 
424   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   PetscLogFlops(49*(2*a->s_nz + mbs));
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolve_SeqSBAIJ_6"
432 int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
435   IS              isrow=a->row;
436   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
437   int             nz,*vj,k,ierr,*r,idx;
438   MatScalar       *aa=a->a,*v,*d;
439   PetscScalar     *x,*b,x0,x1,x2,x3,x4,x5,*t,*tp;
440 
441   PetscFunctionBegin;
442   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
443   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
444   t  = a->solve_work;
445   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
446 
447   /* solve U^T * D * y = b by forward substitution */
448   tp = t;
449   for (k=0; k<mbs; k++) { /* t <- perm(b) */
450     idx   = 6*r[k];
451     tp[0] = b[idx];
452     tp[1] = b[idx+1];
453     tp[2] = b[idx+2];
454     tp[3] = b[idx+3];
455     tp[4] = b[idx+4];
456     tp[5] = b[idx+5];
457     tp += 6;
458   }
459 
460   for (k=0; k<mbs; k++){
461     v  = aa + 36*ai[k];
462     vj = aj + ai[k];
463     tp = t + k*6;
464     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];
465     nz = ai[k+1] - ai[k];
466     tp = t + (*vj)*6;
467     while (nz--) {
468       tp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
469       tp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
470       tp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
471       tp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
472       tp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
473       tp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
474       vj++; tp = t + (*vj)*6;
475       v += 36;
476     }
477 
478     /* xk = inv(Dk)*(Dk*xk) */
479     d  = aa+k*36;          /* ptr to inv(Dk) */
480     tp    = t + k*6;
481     tp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
482     tp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
483     tp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
484     tp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
485     tp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
486     tp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
487   }
488 
489   /* solve U*x = y by back substitution */
490   for (k=mbs-1; k>=0; k--){
491     v  = aa + 36*ai[k];
492     vj = aj + ai[k];
493     tp    = t + k*6;
494     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4]; x5=tp[5];  /* xk */
495     nz = ai[k+1] - ai[k];
496 
497     tp = t + (*vj)*6;
498     while (nz--) {
499       /* xk += U(k,:)*x(:) */
500       x0 += v[0]*tp[0] + v[6]*tp[1] + v[12]*tp[2] + v[18]*tp[3] + v[24]*tp[4] + v[30]*tp[5];
501       x1 += v[1]*tp[0] + v[7]*tp[1] + v[13]*tp[2] + v[19]*tp[3] + v[25]*tp[4] + v[31]*tp[5];
502       x2 += v[2]*tp[0] + v[8]*tp[1] + v[14]*tp[2] + v[20]*tp[3] + v[26]*tp[4] + v[32]*tp[5];
503       x3 += v[3]*tp[0] + v[9]*tp[1] + v[15]*tp[2] + v[21]*tp[3] + v[27]*tp[4] + v[33]*tp[5];
504       x4 += v[4]*tp[0]+ v[10]*tp[1] + v[16]*tp[2] + v[22]*tp[3] + v[28]*tp[4] + v[34]*tp[5];
505       x5 += v[5]*tp[0]+ v[11]*tp[1] + v[17]*tp[2] + v[23]*tp[3] + v[29]*tp[4] + v[35]*tp[5];
506       vj++; tp = t + (*vj)*6;
507       v += 36;
508     }
509     tp    = t + k*6;
510     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4; tp[5]=x5;
511     idx   = 6*r[k];
512     x[idx]     = x0;
513     x[idx+1]   = x1;
514     x[idx+2]   = x2;
515     x[idx+3]   = x3;
516     x[idx+4]   = x4;
517     x[idx+5]   = x5;
518   }
519 
520   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
521   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
522   PetscLogFlops(36*(2*a->s_nz + mbs));
523   PetscFunctionReturn(0);
524 }
525 
526 #undef __FUNCT__
527 #define __FUNCT__ "MatSolve_SeqSBAIJ_6_NaturalOrdering"
528 int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
529 {
530   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
531   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
532   MatScalar       *aa=a->a,*v,*d;
533   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4,x5;
534   int             nz,*vj,k,ierr;
535 
536   PetscFunctionBegin;
537 
538   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540 
541   /* solve U^T * D * y = b by forward substitution */
542   ierr = PetscMemcpy(x,b,6*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
543   for (k=0; k<mbs; k++){
544     v  = aa + 36*ai[k];
545     xp = x + k*6;
546     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* Dk*xk = k-th block of x */
547     nz = ai[k+1] - ai[k];
548     vj = aj + ai[k];
549     xp = x + (*vj)*6;
550     while (nz--) {
551       /* x(:) += U(k,:)^T*(Dk*xk) */
552       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4 + v[5]*x5;
553       xp[1] +=  v[6]*x0 +  v[7]*x1 +  v[8]*x2 + v[9]*x3+ v[10]*x4+ v[11]*x5;
554       xp[2] += v[12]*x0 + v[13]*x1 + v[14]*x2+ v[15]*x3+ v[16]*x4+ v[17]*x5;
555       xp[3] += v[18]*x0 + v[19]*x1 + v[20]*x2+ v[21]*x3+ v[22]*x4+ v[23]*x5;
556       xp[4] += v[24]*x0 + v[25]*x1 + v[26]*x2+ v[27]*x3+ v[28]*x4+ v[29]*x5;
557       xp[5] += v[30]*x0 + v[31]*x1 + v[32]*x2+ v[33]*x3+ v[34]*x4+ v[35]*x5;
558       vj++; xp = x + (*vj)*6;
559       v += 36;
560     }
561     /* xk = inv(Dk)*(Dk*xk) */
562     d  = aa+k*36;          /* ptr to inv(Dk) */
563     xp = x + k*6;
564     xp[0] = d[0]*x0 + d[6]*x1 + d[12]*x2 + d[18]*x3 + d[24]*x4 + d[30]*x5;
565     xp[1] = d[1]*x0 + d[7]*x1 + d[13]*x2 + d[19]*x3 + d[25]*x4 + d[31]*x5;
566     xp[2] = d[2]*x0 + d[8]*x1 + d[14]*x2 + d[20]*x3 + d[26]*x4 + d[32]*x5;
567     xp[3] = d[3]*x0 + d[9]*x1 + d[15]*x2 + d[21]*x3 + d[27]*x4 + d[33]*x5;
568     xp[4] = d[4]*x0+ d[10]*x1 + d[16]*x2 + d[22]*x3 + d[28]*x4 + d[34]*x5;
569     xp[5] = d[5]*x0+ d[11]*x1 + d[17]*x2 + d[23]*x3 + d[29]*x4 + d[35]*x5;
570   }
571 
572   /* solve U*x = y by back substitution */
573   for (k=mbs-1; k>=0; k--){
574     v  = aa + 36*ai[k];
575     xp = x + k*6;
576     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4]; x5=xp[5]; /* xk */
577     nz = ai[k+1] - ai[k];
578     vj = aj + ai[k];
579     xp = x + (*vj)*6;
580     while (nz--) {
581       /* xk += U(k,:)*x(:) */
582       x0 += v[0]*xp[0] + v[6]*xp[1] + v[12]*xp[2] + v[18]*xp[3] + v[24]*xp[4] + v[30]*xp[5];
583       x1 += v[1]*xp[0] + v[7]*xp[1] + v[13]*xp[2] + v[19]*xp[3] + v[25]*xp[4] + v[31]*xp[5];
584       x2 += v[2]*xp[0] + v[8]*xp[1] + v[14]*xp[2] + v[20]*xp[3] + v[26]*xp[4] + v[32]*xp[5];
585       x3 += v[3]*xp[0] + v[9]*xp[1] + v[15]*xp[2] + v[21]*xp[3] + v[27]*xp[4] + v[33]*xp[5];
586       x4 += v[4]*xp[0]+ v[10]*xp[1] + v[16]*xp[2] + v[22]*xp[3] + v[28]*xp[4] + v[34]*xp[5];
587       x5 += v[5]*xp[0]+ v[11]*xp[1] + v[17]*xp[2] + v[23]*xp[3] + v[29]*xp[4] + v[35]*xp[5];
588       vj++;
589       v += 36; xp = x + (*vj)*6;
590     }
591     xp = x + k*6;
592     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4; xp[5]=x5;
593   }
594 
595   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
596   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
597   PetscLogFlops(36*(2*a->s_nz + mbs));
598   PetscFunctionReturn(0);
599 }
600 
601 #undef __FUNCT__
602 #define __FUNCT__ "MatSolve_SeqSBAIJ_5"
603 int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx)
604 {
605   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
606   IS              isrow=a->row;
607   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
608   int             nz,*vj,k,ierr,*r,idx;
609   MatScalar       *aa=a->a,*v,*diag;
610   PetscScalar     *x,*b,x0,x1,x2,x3,x4,*t,*tp;
611 
612   PetscFunctionBegin;
613   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
614   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
615   t  = a->solve_work;
616   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
617 
618   /* solve U^T * D * y = b by forward substitution */
619   tp = t;
620   for (k=0; k<mbs; k++) { /* t <- perm(b) */
621     idx   = 5*r[k];
622     tp[0] = b[idx];
623     tp[1] = b[idx+1];
624     tp[2] = b[idx+2];
625     tp[3] = b[idx+3];
626     tp[4] = b[idx+4];
627     tp += 5;
628   }
629 
630   for (k=0; k<mbs; k++){
631     v  = aa + 25*ai[k];
632     vj = aj + ai[k];
633     tp = t + k*5;
634     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];
635     nz = ai[k+1] - ai[k];
636 
637     tp = t + (*vj)*5;
638     while (nz--) {
639       tp[0] +=  v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3 + v[4]*x4;
640       tp[1] +=  v[5]*x0 + v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4;
641       tp[2] += v[10]*x0+ v[11]*x1+ v[12]*x2+ v[13]*x3+ v[14]*x4;
642       tp[3] += v[15]*x0+ v[16]*x1+ v[17]*x2+ v[18]*x3+ v[19]*x4;
643       tp[4] += v[20]*x0+ v[21]*x1+ v[22]*x2+ v[23]*x3+ v[24]*x4;
644       vj++; tp = t + (*vj)*5;
645       v += 25;
646     }
647 
648     /* xk = inv(Dk)*(Dk*xk) */
649     diag  = aa+k*25;          /* ptr to inv(Dk) */
650     tp    = t + k*5;
651       tp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
652       tp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
653       tp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
654       tp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
655       tp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
656   }
657 
658   /* solve U*x = y by back substitution */
659   for (k=mbs-1; k>=0; k--){
660     v  = aa + 25*ai[k];
661     vj = aj + ai[k];
662     tp    = t + k*5;
663     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; x4=tp[4];/* xk */
664     nz = ai[k+1] - ai[k];
665 
666     tp = t + (*vj)*5;
667     while (nz--) {
668       /* xk += U(k,:)*x(:) */
669       x0 += v[0]*tp[0] + v[5]*tp[1] + v[10]*tp[2] + v[15]*tp[3] + v[20]*tp[4];
670       x1 += v[1]*tp[0] + v[6]*tp[1] + v[11]*tp[2] + v[16]*tp[3] + v[21]*tp[4];
671       x2 += v[2]*tp[0] + v[7]*tp[1] + v[12]*tp[2] + v[17]*tp[3] + v[22]*tp[4];
672       x3 += v[3]*tp[0] + v[8]*tp[1] + v[13]*tp[2] + v[18]*tp[3] + v[23]*tp[4];
673       x4 += v[4]*tp[0] + v[9]*tp[1] + v[14]*tp[2] + v[19]*tp[3] + v[24]*tp[4];
674       vj++; tp = t + (*vj)*5;
675       v += 25;
676     }
677     tp    = t + k*5;
678     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3; tp[4]=x4;
679     idx   = 5*r[k];
680     x[idx]     = x0;
681     x[idx+1]   = x1;
682     x[idx+2]   = x2;
683     x[idx+3]   = x3;
684     x[idx+4]   = x4;
685   }
686 
687   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
688   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
689   PetscLogFlops(25*(2*a->s_nz + mbs));
690   PetscFunctionReturn(0);
691 }
692 
693 #undef __FUNCT__
694 #define __FUNCT__ "MatSolve_SeqSBAIJ_5_NaturalOrdering"
695 int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
696 {
697   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
698   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
699   MatScalar       *aa=a->a,*v,*diag;
700   PetscScalar     *x,*xp,*b,x0,x1,x2,x3,x4;
701   int             nz,*vj,k,ierr;
702 
703   PetscFunctionBegin;
704 
705   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
706   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
707 
708   /* solve U^T * D * y = b by forward substitution */
709   ierr = PetscMemcpy(x,b,5*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
710   for (k=0; k<mbs; k++){
711     v  = aa + 25*ai[k];
712     xp = x + k*5;
713     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* Dk*xk = k-th block of x */
714     nz = ai[k+1] - ai[k];
715     vj = aj + ai[k];
716     xp = x + (*vj)*5;
717     while (nz--) {
718       /* x(:) += U(k,:)^T*(Dk*xk) */
719       xp[0] +=  v[0]*x0 +  v[1]*x1 +  v[2]*x2 + v[3]*x3 + v[4]*x4;
720       xp[1] +=  v[5]*x0 +  v[6]*x1 +  v[7]*x2 + v[8]*x3 + v[9]*x4;
721       xp[2] += v[10]*x0 + v[11]*x1 + v[12]*x2+ v[13]*x3+ v[14]*x4;
722       xp[3] += v[15]*x0 + v[16]*x1 + v[17]*x2+ v[18]*x3+ v[19]*x4;
723       xp[4] += v[20]*x0 + v[21]*x1 + v[22]*x2+ v[23]*x3+ v[24]*x4;
724       vj++; xp = x + (*vj)*5;
725       v += 25;
726     }
727     /* xk = inv(Dk)*(Dk*xk) */
728     diag = aa+k*25;          /* ptr to inv(Dk) */
729     xp   = x + k*5;
730     xp[0] = diag[0]*x0 + diag[5]*x1 + diag[10]*x2 + diag[15]*x3 + diag[20]*x4;
731     xp[1] = diag[1]*x0 + diag[6]*x1 + diag[11]*x2 + diag[16]*x3 + diag[21]*x4;
732     xp[2] = diag[2]*x0 + diag[7]*x1 + diag[12]*x2 + diag[17]*x3 + diag[22]*x4;
733     xp[3] = diag[3]*x0 + diag[8]*x1 + diag[13]*x2 + diag[18]*x3 + diag[23]*x4;
734     xp[4] = diag[4]*x0 + diag[9]*x1 + diag[14]*x2 + diag[19]*x3 + diag[24]*x4;
735   }
736 
737   /* solve U*x = y by back substitution */
738   for (k=mbs-1; k>=0; k--){
739     v  = aa + 25*ai[k];
740     xp = x + k*5;
741     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; x4=xp[4];/* xk */
742     nz = ai[k+1] - ai[k];
743     vj = aj + ai[k];
744     xp = x + (*vj)*5;
745     while (nz--) {
746       /* xk += U(k,:)*x(:) */
747       x0 += v[0]*xp[0] + v[5]*xp[1] + v[10]*xp[2] + v[15]*xp[3] + v[20]*xp[4];
748       x1 += v[1]*xp[0] + v[6]*xp[1] + v[11]*xp[2] + v[16]*xp[3] + v[21]*xp[4];
749       x2 += v[2]*xp[0] + v[7]*xp[1] + v[12]*xp[2] + v[17]*xp[3] + v[22]*xp[4];
750       x3 += v[3]*xp[0] + v[8]*xp[1] + v[13]*xp[2] + v[18]*xp[3] + v[23]*xp[4];
751       x4 += v[4]*xp[0] + v[9]*xp[1] + v[14]*xp[2] + v[19]*xp[3] + v[24]*xp[4];
752       vj++;
753       v += 25; xp = x + (*vj)*5;
754     }
755     xp = x + k*5;
756     xp[0]=x0; xp[1]=x1; xp[2]=x2; xp[3]=x3; xp[4]=x4;
757   }
758 
759   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
760   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
761   PetscLogFlops(25*(2*a->s_nz + mbs));
762   PetscFunctionReturn(0);
763 }
764 
765 #undef __FUNCT__
766 #define __FUNCT__ "MatSolve_SeqSBAIJ_4"
767 int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx)
768 {
769   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
770   IS              isrow=a->row;
771   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
772   int             nz,*vj,k,ierr,*r,idx;
773   MatScalar       *aa=a->a,*v,*diag;
774   PetscScalar     *x,*b,x0,x1,x2,x3,*t,*tp;
775 
776   PetscFunctionBegin;
777   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
778   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
779   t  = a->solve_work;
780   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
781 
782   /* solve U^T * D * y = b by forward substitution */
783   tp = t;
784   for (k=0; k<mbs; k++) { /* t <- perm(b) */
785     idx   = 4*r[k];
786     tp[0] = b[idx];
787     tp[1] = b[idx+1];
788     tp[2] = b[idx+2];
789     tp[3] = b[idx+3];
790     tp += 4;
791   }
792 
793   for (k=0; k<mbs; k++){
794     v  = aa + 16*ai[k];
795     vj = aj + ai[k];
796     tp = t + k*4;
797     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3];
798     nz = ai[k+1] - ai[k];
799 
800     tp = t + (*vj)*4;
801     while (nz--) {
802       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
803       tp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
804       tp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
805       tp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
806       vj++; tp = t + (*vj)*4;
807       v += 16;
808     }
809 
810     /* xk = inv(Dk)*(Dk*xk) */
811     diag  = aa+k*16;          /* ptr to inv(Dk) */
812     tp    = t + k*4;
813     tp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
814     tp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
815     tp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
816     tp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
817   }
818 
819   /* solve U*x = y by back substitution */
820   for (k=mbs-1; k>=0; k--){
821     v  = aa + 16*ai[k];
822     vj = aj + ai[k];
823     tp    = t + k*4;
824     x0=tp[0]; x1=tp[1]; x2=tp[2]; x3=tp[3]; /* xk */
825     nz = ai[k+1] - ai[k];
826 
827     tp = t + (*vj)*4;
828     while (nz--) {
829       /* xk += U(k,:)*x(:) */
830       x0 += v[0]*tp[0] + v[4]*tp[1] + v[8]*tp[2] + v[12]*tp[3];
831       x1 += v[1]*tp[0] + v[5]*tp[1] + v[9]*tp[2] + v[13]*tp[3];
832       x2 += v[2]*tp[0] + v[6]*tp[1]+ v[10]*tp[2] + v[14]*tp[3];
833       x3 += v[3]*tp[0] + v[7]*tp[1]+ v[11]*tp[2] + v[15]*tp[3];
834       vj++; tp = t + (*vj)*4;
835       v += 16;
836     }
837     tp    = t + k*4;
838     tp[0]=x0; tp[1]=x1; tp[2]=x2; tp[3]=x3;
839     idx        = 4*r[k];
840     x[idx]     = x0;
841     x[idx+1]   = x1;
842     x[idx+2]   = x2;
843     x[idx+3]   = x3;
844   }
845 
846   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
847   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
848   PetscLogFlops(16*(2*a->s_nz + mbs));
849   PetscFunctionReturn(0);
850 }
851 
852 /*
853    Special case where the matrix was factored in the natural ordering.
854    This eliminates the need for the column and row permutation.
855 */
856 #undef __FUNCT__
857 #define __FUNCT__ "MatSolve_SeqSBAIJ_4_NaturalOrdering"
858 int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
859 {
860   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
861   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
862   MatScalar       *aa=a->a,*v,*diag;
863   PetscScalar     *x,*xp,*b,x0,x1,x2,x3;
864   int             nz,*vj,k,ierr;
865 
866   PetscFunctionBegin;
867 
868   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
869   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
870 
871   /* solve U^T * D * y = b by forward substitution */
872   ierr = PetscMemcpy(x,b,4*mbs*sizeof(PetscScalar));CHKERRQ(ierr); /* x <- b */
873   for (k=0; k<mbs; k++){
874     v  = aa + 16*ai[k];
875     xp = x + k*4;
876     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* Dk*xk = k-th block of x */
877     nz = ai[k+1] - ai[k];
878     vj = aj + ai[k];
879     xp = x + (*vj)*4;
880     while (nz--) {
881       /* x(:) += U(k,:)^T*(Dk*xk) */
882       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2 + v[3]*x3;
883       xp[1] += v[4]*x0 + v[5]*x1 + v[6]*x2 + v[7]*x3;
884       xp[2] += v[8]*x0 + v[9]*x1 + v[10]*x2+ v[11]*x3;
885       xp[3] += v[12]*x0+ v[13]*x1+ v[14]*x2+ v[15]*x3;
886       vj++; xp = x + (*vj)*4;
887       v += 16;
888     }
889     /* xk = inv(Dk)*(Dk*xk) */
890     diag = aa+k*16;          /* ptr to inv(Dk) */
891     xp   = x + k*4;
892     xp[0] = diag[0]*x0 + diag[4]*x1 + diag[8]*x2 + diag[12]*x3;
893     xp[1] = diag[1]*x0 + diag[5]*x1 + diag[9]*x2 + diag[13]*x3;
894     xp[2] = diag[2]*x0 + diag[6]*x1 + diag[10]*x2+ diag[14]*x3;
895     xp[3] = diag[3]*x0 + diag[7]*x1 + diag[11]*x2+ diag[15]*x3;
896   }
897 
898   /* solve U*x = y by back substitution */
899   for (k=mbs-1; k>=0; k--){
900     v  = aa + 16*ai[k];
901     xp = x + k*4;
902     x0=xp[0]; x1=xp[1]; x2=xp[2]; x3=xp[3]; /* xk */
903     nz = ai[k+1] - ai[k];
904     vj = aj + ai[k];
905     xp = x + (*vj)*4;
906     while (nz--) {
907       /* xk += U(k,:)*x(:) */
908       x0 += v[0]*xp[0] + v[4]*xp[1] + v[8]*xp[2] + v[12]*xp[3];
909       x1 += v[1]*xp[0] + v[5]*xp[1] + v[9]*xp[2] + v[13]*xp[3];
910       x2 += v[2]*xp[0] + v[6]*xp[1]+ v[10]*xp[2] + v[14]*xp[3];
911       x3 += v[3]*xp[0] + v[7]*xp[1]+ v[11]*xp[2] + v[15]*xp[3];
912       vj++;
913       v += 16; xp = x + (*vj)*4;
914     }
915     xp = x + k*4;
916     xp[0] = x0; xp[1] = x1; xp[2] = x2; xp[3] = x3;
917   }
918 
919   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
920   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
921   PetscLogFlops(16*(2*a->s_nz + mbs));
922   PetscFunctionReturn(0);
923 }
924 
925 #undef __FUNCT__
926 #define __FUNCT__ "MatSolve_SeqSBAIJ_3"
927 int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx)
928 {
929   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
930   IS              isrow=a->row;
931   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
932   int             nz,*vj,k,ierr,*r,idx;
933   MatScalar       *aa=a->a,*v,*diag;
934   PetscScalar     *x,*b,x0,x1,x2,*t,*tp;
935 
936   PetscFunctionBegin;
937   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
938   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
939   t  = a->solve_work;
940   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
941 
942   /* solve U^T * D * y = b by forward substitution */
943   tp = t;
944   for (k=0; k<mbs; k++) { /* t <- perm(b) */
945     idx   = 3*r[k];
946     tp[0] = b[idx];
947     tp[1] = b[idx+1];
948     tp[2] = b[idx+2];
949     tp += 3;
950   }
951 
952   for (k=0; k<mbs; k++){
953     v  = aa + 9*ai[k];
954     vj = aj + ai[k];
955     tp = t + k*3;
956     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];
957     nz = ai[k+1] - ai[k];
958 
959     tp = t + (*vj)*3;
960     while (nz--) {
961       tp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
962       tp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
963       tp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
964       vj++; tp = t + (*vj)*3;
965       v += 9;
966     }
967 
968     /* xk = inv(Dk)*(Dk*xk) */
969     diag  = aa+k*9;          /* ptr to inv(Dk) */
970     tp    = t + k*3;
971     tp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
972     tp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
973     tp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
974   }
975 
976   /* solve U*x = y by back substitution */
977   for (k=mbs-1; k>=0; k--){
978     v  = aa + 9*ai[k];
979     vj = aj + ai[k];
980     tp    = t + k*3;
981     x0 = tp[0]; x1 = tp[1]; x2 = tp[2];  /* xk */
982     nz = ai[k+1] - ai[k];
983 
984     tp = t + (*vj)*3;
985     while (nz--) {
986       /* xk += U(k,:)*x(:) */
987       x0 += v[0]*tp[0] + v[3]*tp[1] + v[6]*tp[2];
988       x1 += v[1]*tp[0] + v[4]*tp[1] + v[7]*tp[2];
989       x2 += v[2]*tp[0] + v[5]*tp[1] + v[8]*tp[2];
990       vj++; tp = t + (*vj)*3;
991       v += 9;
992     }
993     tp    = t + k*3;
994     tp[0] = x0; tp[1] = x1; tp[2] = x2;
995     idx      = 3*r[k];
996     x[idx]   = x0;
997     x[idx+1] = x1;
998     x[idx+2] = x2;
999   }
1000 
1001   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1002   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1003   PetscLogFlops(9*(2*a->s_nz + mbs));
1004   PetscFunctionReturn(0);
1005 }
1006 
1007 /*
1008    Special case where the matrix was factored in the natural ordering.
1009    This eliminates the need for the column and row permutation.
1010 */
1011 #undef __FUNCT__
1012 #define __FUNCT__ "MatSolve_SeqSBAIJ_3_NaturalOrdering"
1013 int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
1014 {
1015   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1016   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1017   MatScalar       *aa=a->a,*v,*diag;
1018   PetscScalar     *x,*xp,*b,x0,x1,x2;
1019   int             nz,*vj,k,ierr;
1020 
1021   PetscFunctionBegin;
1022 
1023   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1024   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1025 
1026   /* solve U^T * D * y = b by forward substitution */
1027   ierr = PetscMemcpy(x,b,3*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1028   for (k=0; k<mbs; k++){
1029     v  = aa + 9*ai[k];
1030     xp = x + k*3;
1031     x0 = xp[0]; x1 = xp[1]; x2 = xp[2]; /* Dk*xk = k-th block of x */
1032     nz = ai[k+1] - ai[k];
1033     vj = aj + ai[k];
1034     xp = x + (*vj)*3;
1035     while (nz--) {
1036       /* x(:) += U(k,:)^T*(Dk*xk) */
1037       xp[0] += v[0]*x0 + v[1]*x1 + v[2]*x2;
1038       xp[1] += v[3]*x0 + v[4]*x1 + v[5]*x2;
1039       xp[2] += v[6]*x0 + v[7]*x1 + v[8]*x2;
1040       vj++; xp = x + (*vj)*3;
1041       v += 9;
1042     }
1043     /* xk = inv(Dk)*(Dk*xk) */
1044     diag = aa+k*9;          /* ptr to inv(Dk) */
1045     xp   = x + k*3;
1046     xp[0] = diag[0]*x0 + diag[3]*x1 + diag[6]*x2;
1047     xp[1] = diag[1]*x0 + diag[4]*x1 + diag[7]*x2;
1048     xp[2] = diag[2]*x0 + diag[5]*x1 + diag[8]*x2;
1049   }
1050 
1051   /* solve U*x = y by back substitution */
1052   for (k=mbs-1; k>=0; k--){
1053     v  = aa + 9*ai[k];
1054     xp = x + k*3;
1055     x0 = xp[0]; x1 = xp[1]; x2 = xp[2];  /* xk */
1056     nz = ai[k+1] - ai[k];
1057     vj = aj + ai[k];
1058     xp = x + (*vj)*3;
1059     while (nz--) {
1060       /* xk += U(k,:)*x(:) */
1061       x0 += v[0]*xp[0] + v[3]*xp[1] + v[6]*xp[2];
1062       x1 += v[1]*xp[0] + v[4]*xp[1] + v[7]*xp[2];
1063       x2 += v[2]*xp[0] + v[5]*xp[1] + v[8]*xp[2];
1064       vj++;
1065       v += 9; xp = x + (*vj)*3;
1066     }
1067     xp = x + k*3;
1068     xp[0] = x0; xp[1] = x1; xp[2] = x2;
1069   }
1070 
1071   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1072   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1073   PetscLogFlops(9*(2*a->s_nz + mbs));
1074   PetscFunctionReturn(0);
1075 }
1076 
1077 #undef __FUNCT__
1078 #define __FUNCT__ "MatSolve_SeqSBAIJ_2"
1079 int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx)
1080 {
1081   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ *)A->data;
1082   IS              isrow=a->row;
1083   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1084   int             nz,*vj,k,k2,ierr,*r,idx;
1085   MatScalar       *aa=a->a,*v,*diag;
1086   PetscScalar     *x,*b,x0,x1,*t;
1087 
1088   PetscFunctionBegin;
1089   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1091   t  = a->solve_work;
1092   /* printf("called MatSolve_SeqSBAIJ_2\n"); */
1093   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
1094 
1095   /* solve U^T * D * y = perm(b) by forward substitution */
1096   for (k=0; k<mbs; k++) {  /* t <- perm(b) */
1097     idx = 2*r[k];
1098     t[k*2]   = b[idx];
1099     t[k*2+1] = b[idx+1];
1100   }
1101   for (k=0; k<mbs; k++){
1102     v  = aa + 4*ai[k];
1103     vj = aj + ai[k];
1104     k2 = k*2;
1105     x0 = t[k2]; x1 = t[k2+1];
1106     nz = ai[k+1] - ai[k];
1107     while (nz--) {
1108       t[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1109       t[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1110       vj++; v += 4;
1111     }
1112     diag = aa+k*4;  /* ptr to inv(Dk) */
1113     t[k2]   = diag[0]*x0 + diag[2]*x1;
1114     t[k2+1] = diag[1]*x0 + diag[3]*x1;
1115   }
1116 
1117   /* solve U*x = y by back substitution */
1118   for (k=mbs-1; k>=0; k--){
1119     v  = aa + 4*ai[k];
1120     vj = aj + ai[k];
1121     k2 = k*2;
1122     x0 = t[k2]; x1 = t[k2+1];
1123     nz = ai[k+1] - ai[k];
1124     while (nz--) {
1125       x0 += v[0]*t[(*vj)*2] + v[2]*t[(*vj)*2+1];
1126       x1 += v[1]*t[(*vj)*2] + v[3]*t[(*vj)*2+1];
1127       vj++; v += 4;
1128     }
1129     t[k2]    = x0;
1130     t[k2+1]  = x1;
1131     idx      = 2*r[k];
1132     x[idx]   = x0;
1133     x[idx+1] = x1;
1134   }
1135 
1136   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
1137   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1138   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1139   PetscLogFlops(4*(2*a->s_nz + mbs));
1140   PetscFunctionReturn(0);
1141 }
1142 
1143 /*
1144    Special case where the matrix was factored in the natural ordering.
1145    This eliminates the need for the column and row permutation.
1146 */
1147 #undef __FUNCT__
1148 #define __FUNCT__ "MatSolve_SeqSBAIJ_2_NaturalOrdering"
1149 int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1150 {
1151   Mat_SeqSBAIJ    *a=(Mat_SeqSBAIJ*)A->data;
1152   int             mbs=a->mbs,*ai=a->i,*aj=a->j;
1153   MatScalar       *aa=a->a,*v,*diag;
1154   PetscScalar     *x,*b,x0,x1;
1155   int             nz,*vj,k,k2,ierr;
1156 
1157   PetscFunctionBegin;
1158 
1159   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1160   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1161 
1162   /* solve U^T * D * y = b by forward substitution */
1163   ierr = PetscMemcpy(x,b,2*mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1164   for (k=0; k<mbs; k++){
1165     v  = aa + 4*ai[k];
1166     vj = aj + ai[k];
1167     k2 = k*2;
1168     x0 = x[k2]; x1 = x[k2+1];  /* Dk*xk = k-th block of x */
1169     nz = ai[k+1] - ai[k];
1170 
1171     while (nz--) {
1172       /* x(:) += U(k,:)^T*(Dk*xk) */
1173       x[(*vj)*2]   += v[0]*x0 + v[1]*x1;
1174       x[(*vj)*2+1] += v[2]*x0 + v[3]*x1;
1175       vj++; v += 4;
1176     }
1177     /* xk = inv(Dk)*(Dk*xk) */
1178     diag = aa+k*4;          /* ptr to inv(Dk) */
1179     x[k2]   = diag[0]*x0 + diag[2]*x1;
1180     x[k2+1] = diag[1]*x0 + diag[3]*x1;
1181   }
1182 
1183   /* solve U*x = y by back substitution */
1184   for (k=mbs-1; k>=0; k--){
1185     v  = aa + 4*ai[k];
1186     vj = aj + ai[k];
1187     k2 = k*2;
1188     x0 = x[k2]; x1 = x[k2+1];  /* xk */
1189     nz = ai[k+1] - ai[k];
1190     while (nz--) {
1191       /* xk += U(k,:)*x(:) */
1192       x0 += v[0]*x[(*vj)*2] + v[2]*x[(*vj)*2+1];
1193       x1 += v[1]*x[(*vj)*2] + v[3]*x[(*vj)*2+1];
1194       vj++; v += 4;
1195     }
1196     x[k2]     = x0;
1197     x[k2+1]   = x1;
1198   }
1199 
1200   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1201   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1202   PetscLogFlops(4*(2*a->s_nz + mbs)); /* bs2*(2*a->s_nz + mbs) */
1203   PetscFunctionReturn(0);
1204 }
1205 
1206 #undef __FUNCT__
1207 #define __FUNCT__ "MatSolve_SeqSBAIJ_1"
1208 int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx)
1209 {
1210   Mat_SeqSBAIJ     *a = (Mat_SeqSBAIJ *)A->data;
1211   IS              isrow=a->row;
1212   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip;
1213   MatScalar       *aa=a->a,*v;
1214   PetscScalar     *x,*b,xk,*t;
1215   int             nz,*vj,k;
1216 
1217   PetscFunctionBegin;
1218   if (!mbs) PetscFunctionReturn(0);
1219 
1220   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1221   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1222   t = a->solve_work;
1223 
1224   ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr);
1225 
1226   /* solve U^T*D*y = perm(b) by forward substitution */
1227   for (k=0; k<mbs; k++) t[k] = b[rip[k]];
1228   for (k=0; k<mbs; k++){
1229     v  = aa + ai[k];
1230     vj = aj + ai[k];
1231     xk = t[k];
1232     nz = ai[k+1] - ai[k];
1233     while (nz--) t[*vj++] += (*v++) * xk;
1234     t[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1235   }
1236 
1237   /* solve U*x = y by back substitution */
1238   for (k=mbs-1; k>=0; k--){
1239     v  = aa + ai[k];
1240     vj = aj + ai[k];
1241     xk = t[k];
1242     nz = ai[k+1] - ai[k];
1243     while (nz--) xk += (*v++) * t[*vj++];
1244     t[k] = xk;
1245     x[rip[k]] = xk;
1246   }
1247 
1248   ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr);
1249   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1250   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1251   PetscLogFlops(4*a->s_nz + A->m);
1252   PetscFunctionReturn(0);
1253 }
1254 
1255 /*
1256       Special case where the matrix was ILU(0) factored in the natural
1257    ordering. This eliminates the need for the column and row permutation.
1258 */
1259 #undef __FUNCT__
1260 #define __FUNCT__ "MatSolve_SeqSBAIJ_1_NaturalOrdering"
1261 int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
1262 {
1263   Mat_SeqSBAIJ    *a = (Mat_SeqSBAIJ *)A->data;
1264   int             mbs=a->mbs,*ai=a->i,*aj=a->j,ierr;
1265   MatScalar       *aa=a->a,*v;
1266   PetscScalar     *x,*b,xk;
1267   int             nz,*vj,k;
1268 
1269   PetscFunctionBegin;
1270 
1271   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1272   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1273 
1274   /* solve U^T*D*y = b by forward substitution */
1275   ierr = PetscMemcpy(x,b,mbs*sizeof(PetscScalar));CHKERRQ(ierr);
1276   for (k=0; k<mbs; k++){
1277     v  = aa + ai[k];
1278     vj = aj + ai[k];
1279     xk = x[k];
1280     nz = ai[k+1] - ai[k];
1281     while (nz--) x[*vj++] += (*v++) * xk;
1282     x[k] = xk*aa[k];  /* note: aa[k] = 1/D(k) */
1283   }
1284 
1285   /* solve U*x = y by back substitution */
1286   for (k=mbs-2; k>=0; k--){
1287     v  = aa + ai[k];
1288     vj = aj + ai[k];
1289     xk = x[k];
1290     nz = ai[k+1] - ai[k];
1291     while (nz--) xk += (*v++) * x[*vj++];
1292     x[k] = xk;
1293   }
1294 
1295   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1296   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1297   PetscLogFlops(4*a->s_nz + A->m);
1298   PetscFunctionReturn(0);
1299 }
1300 
1301 #undef __FUNCT__
1302 #define __FUNCT__ "MatICCFactorSymbolic_SeqSBAIJ"
1303 int MatICCFactorSymbolic_SeqSBAIJ(Mat A,IS perm,PetscReal f,int levels,Mat *B)
1304 {
1305   Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b;
1306   int         *rip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j;
1307   int         *jutmp,bs = a->bs,bs2=a->bs2;
1308   int         m,nzi,realloc = 0,*levtmp;
1309   int         *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd;
1310   int         incrlev,*lev,lev_ik,shift;
1311   PetscTruth  perm_identity;
1312 
1313   PetscFunctionBegin;
1314 
1315   /* check whether perm is the identity mapping */
1316   ierr = ISIdentity(perm,&perm_identity);CHKERRQ(ierr);
1317   if (!perm_identity) a->permute = PETSC_TRUE;
1318   if (perm_identity){
1319     ai = a->i; aj = a->j;
1320   } else { /*  non-trivial permutation */
1321     ierr = MatReorderingSeqSBAIJ(A, perm);CHKERRQ(ierr);
1322     ai = a->inew; aj = a->jnew;
1323   }
1324 
1325   /* initialization */
1326   /* Don't know how many column pointers are needed so estimate.
1327      Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */
1328   ierr  = ISGetIndices(perm,&rip);CHKERRQ(ierr);
1329   umax  = (int)(f*ai[mbs] + 1);
1330   ierr  = PetscMalloc(umax*sizeof(int),&lev);CHKERRQ(ierr);
1331   umax += mbs + 1;
1332   shift = mbs + 1;
1333   ierr  = PetscMalloc((mbs+1)*sizeof(int),&iu);CHKERRQ(ierr);
1334   ierr  = PetscMalloc(umax*sizeof(int),&ju);CHKERRQ(ierr);
1335   iu[0] = mbs+1;
1336   juptr = mbs;
1337   ierr  = PetscMalloc(mbs*sizeof(int),&jl);CHKERRQ(ierr);
1338   ierr  = PetscMalloc(mbs*sizeof(int),&q);CHKERRQ(ierr);
1339   ierr  = PetscMalloc((mbs+1)*sizeof(int),&levtmp);CHKERRQ(ierr);
1340   for (i=0; i<mbs; i++){
1341     jl[i] = mbs; q[i] = 0;
1342   }
1343 
1344   /* for each row k */
1345   for (k=0; k<mbs; k++){
1346     nzk  = 0;
1347     q[k] = mbs;
1348     /* initialize nonzero structure of k-th row to row rip[k] of A */
1349     jmin = ai[rip[k]];
1350     jmax = ai[rip[k]+1];
1351     for (j=jmin; j<jmax; j++){
1352       vj = rip[aj[j]];
1353       if (vj > k){
1354         qm = k;
1355         do {
1356           m  = qm; qm = q[m];
1357         } while(qm < vj);
1358         if (qm == vj) {
1359           SETERRQ(1," error: duplicate entry in A\n");
1360         }
1361         nzk++;
1362         q[m]       = vj;
1363         q[vj]      = qm;
1364         levtmp[vj] = 0;   /* initialize lev for nonzero element */
1365       }
1366     }
1367 
1368     /* modify nonzero structure of k-th row by computing fill-in
1369        for each row i to be merged in */
1370     i = k;
1371     i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */
1372 
1373     while (i < mbs){
1374       /* merge row i into k-th row */
1375       j      = iu[i];
1376       /* lev_ik = lev[j];  */
1377       lev_ik = lev[j-shift]; /* old */
1378       nzi    = iu[i+1] - (iu[i]+1);
1379       jmin   = iu[i] + 1; jmax = iu[i] + nzi;
1380       qm = k;
1381       for (j=jmin; j<jmax+1; j++){
1382         vj      = ju[j];
1383         /* incrlev = lev[j]+lev_ik+1; */
1384         incrlev = lev[j-shift]+lev_ik+1; /* old */
1385 
1386 	if (incrlev > levels) continue;
1387         do {
1388           m = qm; qm = q[m];
1389         } while (qm < vj);
1390         if (qm != vj){      /* a fill */
1391           nzk++; q[m] = vj; q[vj] = qm; qm = vj;
1392           levtmp[vj] = incrlev;
1393         } else {              /* already a nonzero element */
1394           if (levtmp[vj]>incrlev) levtmp[vj] = incrlev;
1395         }
1396       }
1397       i = jl[i]; /* next pivot row */
1398     }
1399 
1400     /* add k to row list for first nonzero element in k-th row */
1401     if (nzk > 1){
1402       i = q[k]; /* col value of first nonzero element in k_th row of U */
1403       jl[k] = jl[i]; jl[i] = k;
1404     }
1405     iu[k+1] = iu[k] + nzk;
1406 
1407     /* allocate more space to ju and lev if needed */
1408     if (iu[k+1] > umax) {
1409       /* estimate how much additional space we will need */
1410       /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
1411       /* just double the memory each time */
1412       maxadd = umax;
1413       if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2;
1414       umax += maxadd;
1415 
1416       /* allocate a longer ju */
1417       ierr     = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
1418       ierr     = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr);
1419       ierr     = PetscFree(ju);CHKERRQ(ierr);
1420       ju       = jutmp;
1421 
1422       ierr     = PetscMalloc(umax*sizeof(int),&jutmp);CHKERRQ(ierr);
1423       /* ierr     = PetscMemcpy(jutmp,lev,iu[k]*sizeof(int));CHKERRQ(ierr); */
1424       ierr     = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));CHKERRQ(ierr); /* old */
1425       ierr     = PetscFree(lev);CHKERRQ(ierr);
1426       lev      = jutmp;
1427       realloc += 2; /* count how many times we realloc */
1428     }
1429 
1430     /* save nonzero structure of k-th row in ju */
1431     i=k;
1432     jumin = juptr + 1; juptr += nzk;
1433     /* lev[juptr] = 0; */ /* new! */
1434     for (j=jumin; j<juptr+1; j++){
1435       i      = q[i];
1436       ju[j]  = i;
1437       /* lev[j] = levtmp[i]; */
1438       lev[j-shift] = levtmp[i]; /* old */
1439     }
1440   }
1441 
1442   if (ai[mbs] != 0) {
1443     PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1444     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
1445     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Run with -pc_icc_fill %g or use \n",af);
1446     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:PCICCSetFill(pc,%g);\n",af);
1447     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:for best performance.\n");
1448   } else {
1449     PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Empty matrix.\n");
1450   }
1451 
1452   ierr = ISRestoreIndices(perm,&rip);CHKERRQ(ierr);
1453   ierr = PetscFree(q);CHKERRQ(ierr);
1454   ierr = PetscFree(jl);CHKERRQ(ierr);
1455   ierr = PetscFree(lev);CHKERRQ(ierr);
1456   ierr = PetscFree(levtmp);CHKERRQ(ierr);
1457 
1458   /* put together the new matrix */
1459   ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr);
1460   /* PetscLogObjectParent(*B,iperm); */
1461   b    = (Mat_SeqSBAIJ*)(*B)->data;
1462   ierr = PetscFree(b->imax);CHKERRQ(ierr);
1463   b->singlemalloc = PETSC_FALSE;
1464   /* the next line frees the default space generated by the Create() */
1465   ierr    = PetscFree(b->a);CHKERRQ(ierr);
1466   ierr    = PetscFree(b->ilen);CHKERRQ(ierr);
1467   ierr    = PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2,&b->a);CHKERRQ(ierr);
1468   b->j    = ju;
1469   b->i    = iu;
1470   b->diag = 0;
1471   b->ilen = 0;
1472   b->imax = 0;
1473 
1474   if (b->row) {
1475     ierr = ISDestroy(b->row);CHKERRQ(ierr);
1476   }
1477   if (b->icol) {
1478     ierr = ISDestroy(b->icol);CHKERRQ(ierr);
1479   }
1480   b->row  = perm;
1481   b->icol = perm;
1482   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1483   ierr    = PetscObjectReference((PetscObject)perm);CHKERRQ(ierr);
1484   ierr    = PetscMalloc((bs*mbs+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
1485   /* In b structure:  Free imax, ilen, old a, old j.
1486      Allocate idnew, solve_work, new a, new j */
1487   PetscLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar)));
1488   b->s_maxnz = b->s_nz = iu[mbs];
1489 
1490   (*B)->factor                 = FACTOR_CHOLESKY;
1491   (*B)->info.factor_mallocs    = realloc;
1492   (*B)->info.fill_ratio_given  = f;
1493   if (ai[mbs] != 0) {
1494     (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]);
1495   } else {
1496     (*B)->info.fill_ratio_needed = 0.0;
1497   }
1498 
1499   if (perm_identity){
1500     switch (bs) {
1501       case 1:
1502         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_1_NaturalOrdering;
1503         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_1_NaturalOrdering;
1504         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=1\n");
1505         break;
1506       case 2:
1507         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_2_NaturalOrdering;
1508         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_2_NaturalOrdering;
1509         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=2\n");
1510         break;
1511       case 3:
1512         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_3_NaturalOrdering;
1513         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_3_NaturalOrdering;
1514         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:sing special in-place natural ordering factor and solve BS=3\n");
1515         break;
1516       case 4:
1517         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_4_NaturalOrdering;
1518         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_4_NaturalOrdering;
1519         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=4\n");
1520         break;
1521       case 5:
1522         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_5_NaturalOrdering;
1523         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_5_NaturalOrdering;
1524         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=5\n");
1525         break;
1526       case 6:
1527         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_6_NaturalOrdering;
1528         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_6_NaturalOrdering;
1529         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=6\n");
1530         break;
1531       case 7:
1532         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_7_NaturalOrdering;
1533         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_7_NaturalOrdering;
1534         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS=7\n");
1535       break;
1536       default:
1537         (*B)->ops->choleskyfactornumeric = MatCholeskyFactorNumeric_SeqSBAIJ_N_NaturalOrdering;
1538         (*B)->ops->solve                 = MatSolve_SeqSBAIJ_N_NaturalOrdering;
1539         PetscLogInfo(A,"MatICCFactorSymbolic_SeqSBAIJ:Using special in-place natural ordering factor and solve BS>7\n");
1540       break;
1541     }
1542   }
1543 
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 
1548 
1549