xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision af0996ce37bc06907c37d8d91773840993d61e62)
128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2*af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
328e30874SSatish Balay 
428e30874SSatish Balay #undef __FUNCT__
528e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
628e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
728e30874SSatish Balay {
828e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
928e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
1028e30874SSatish Balay   PetscErrorCode    ierr;
1128e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
1228e30874SSatish Balay   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
1328e30874SSatish Balay   PetscInt          i,nz;
1428e30874SSatish Balay   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1628e30874SSatish Balay   PetscScalar       *x,*s,*t,*ls;
1728e30874SSatish Balay   const PetscScalar *b;
1828e30874SSatish Balay 
1928e30874SSatish Balay   PetscFunctionBegin;
2028e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
2128e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2228e30874SSatish Balay   t    = a->solve_work;
2328e30874SSatish Balay 
2428e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2528e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2628e30874SSatish Balay 
2728e30874SSatish Balay   /* forward solve the lower triangular */
2828e30874SSatish Balay   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2928e30874SSatish Balay   for (i=1; i<n; i++) {
3028e30874SSatish Balay     v    = aa + bs2*ai[i];
3128e30874SSatish Balay     vi   = aj + ai[i];
3228e30874SSatish Balay     nz   = a->diag[i] - ai[i];
3328e30874SSatish Balay     s    = t + bs*i;
3428e30874SSatish Balay     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
3528e30874SSatish Balay     while (nz--) {
3628e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
3728e30874SSatish Balay       v += bs2;
3828e30874SSatish Balay     }
3928e30874SSatish Balay   }
4028e30874SSatish Balay   /* backward solve the upper triangular */
4128e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
4228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
4328e30874SSatish Balay     v    = aa + bs2*(a->diag[i] + 1);
4428e30874SSatish Balay     vi   = aj + a->diag[i] + 1;
4528e30874SSatish Balay     nz   = ai[i+1] - a->diag[i] - 1;
4628e30874SSatish Balay     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
4728e30874SSatish Balay     while (nz--) {
4828e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
4928e30874SSatish Balay       v += bs2;
5028e30874SSatish Balay     }
5128e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
5228e30874SSatish Balay     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
5328e30874SSatish Balay   }
5428e30874SSatish Balay 
5528e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5628e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5728e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
5828e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5928e30874SSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
6028e30874SSatish Balay   PetscFunctionReturn(0);
6128e30874SSatish Balay }
6228e30874SSatish Balay 
6328e30874SSatish Balay #undef __FUNCT__
6428e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
6528e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
6628e30874SSatish Balay {
6728e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
6828e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
6928e30874SSatish Balay   PetscErrorCode    ierr;
7028e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
7128e30874SSatish Balay   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
7228e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
7328e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
7428e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
7528e30874SSatish Balay   const PetscScalar *b;
7628e30874SSatish Balay 
7728e30874SSatish Balay   PetscFunctionBegin;
7828e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
7928e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8028e30874SSatish Balay   t    = a->solve_work;
8128e30874SSatish Balay 
8228e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
8328e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
8428e30874SSatish Balay 
8528e30874SSatish Balay   /* forward solve the lower triangular */
8628e30874SSatish Balay   idx  = 7*(*r++);
8728e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
8828e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
8928e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
9028e30874SSatish Balay 
9128e30874SSatish Balay   for (i=1; i<n; i++) {
9228e30874SSatish Balay     v   = aa + 49*ai[i];
9328e30874SSatish Balay     vi  = aj + ai[i];
9428e30874SSatish Balay     nz  = diag[i] - ai[i];
9528e30874SSatish Balay     idx = 7*(*r++);
9628e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
9728e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
9828e30874SSatish Balay     while (nz--) {
9928e30874SSatish Balay       idx = 7*(*vi++);
10028e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
10128e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
10228e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
10328e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
10428e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
10528e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
10628e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
10728e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
10828e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
10928e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11028e30874SSatish Balay       v  += 49;
11128e30874SSatish Balay     }
11228e30874SSatish Balay     idx      = 7*i;
11328e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
11428e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
11528e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
11628e30874SSatish Balay   }
11728e30874SSatish Balay   /* backward solve the upper triangular */
11828e30874SSatish Balay   for (i=n-1; i>=0; i--) {
11928e30874SSatish Balay     v   = aa + 49*diag[i] + 49;
12028e30874SSatish Balay     vi  = aj + diag[i] + 1;
12128e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
12228e30874SSatish Balay     idt = 7*i;
12328e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
12428e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
12528e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
12628e30874SSatish Balay     while (nz--) {
12728e30874SSatish Balay       idx = 7*(*vi++);
12828e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
12928e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13028e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
13128e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13228e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13328e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13428e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13528e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13628e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13728e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13828e30874SSatish Balay       v  += 49;
13928e30874SSatish Balay     }
14028e30874SSatish Balay     idc    = 7*(*c--);
14128e30874SSatish Balay     v      = aa + 49*diag[i];
14228e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
14328e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
14428e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
14528e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
14628e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
14728e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
14828e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
14928e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
15028e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
15128e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
15228e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
15328e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
15428e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
15528e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
15628e30874SSatish Balay   }
15728e30874SSatish Balay 
15828e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15928e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16028e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
16128e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
16228e30874SSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
16328e30874SSatish Balay   PetscFunctionReturn(0);
16428e30874SSatish Balay }
16528e30874SSatish Balay 
16628e30874SSatish Balay #undef __FUNCT__
16728e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
16828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
16928e30874SSatish Balay {
17028e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
17128e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
17228e30874SSatish Balay   PetscErrorCode    ierr;
17328e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
17428e30874SSatish Balay   const PetscInt    n=a->mbs,*rout,*cout,*vi;
17528e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
17628e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
17728e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
17828e30874SSatish Balay   const PetscScalar *b;
17928e30874SSatish Balay 
18028e30874SSatish Balay   PetscFunctionBegin;
18128e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
18228e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
18328e30874SSatish Balay   t    = a->solve_work;
18428e30874SSatish Balay 
18528e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
18628e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
18728e30874SSatish Balay 
18828e30874SSatish Balay   /* forward solve the lower triangular */
18928e30874SSatish Balay   idx  = 7*r[0];
19028e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
19128e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
19228e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
19328e30874SSatish Balay 
19428e30874SSatish Balay   for (i=1; i<n; i++) {
19528e30874SSatish Balay     v   = aa + 49*ai[i];
19628e30874SSatish Balay     vi  = aj + ai[i];
19728e30874SSatish Balay     nz  = ai[i+1] - ai[i];
19828e30874SSatish Balay     idx = 7*r[i];
19928e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
20028e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
20128e30874SSatish Balay     for (m=0; m<nz; m++) {
20228e30874SSatish Balay       idx = 7*vi[m];
20328e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
20428e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
20528e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
20628e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
20728e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
20828e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
20928e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
21028e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
21128e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
21228e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
21328e30874SSatish Balay       v  += 49;
21428e30874SSatish Balay     }
21528e30874SSatish Balay     idx      = 7*i;
21628e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
21728e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
21828e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
21928e30874SSatish Balay   }
22028e30874SSatish Balay   /* backward solve the upper triangular */
22128e30874SSatish Balay   for (i=n-1; i>=0; i--) {
22228e30874SSatish Balay     v   = aa + 49*(adiag[i+1]+1);
22328e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
22428e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
22528e30874SSatish Balay     idt = 7*i;
22628e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
22728e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
22828e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
22928e30874SSatish Balay     for (m=0; m<nz; m++) {
23028e30874SSatish Balay       idx = 7*vi[m];
23128e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
23228e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
23328e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
23428e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
23528e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
23628e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
23728e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
23828e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
23928e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
24028e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
24128e30874SSatish Balay       v  += 49;
24228e30874SSatish Balay     }
24328e30874SSatish Balay     idc    = 7*c[i];
24428e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
24528e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
24628e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
24728e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
24828e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
24928e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
25028e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
25128e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
25228e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
25328e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
25428e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
25528e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
25628e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
25728e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
25828e30874SSatish Balay   }
25928e30874SSatish Balay 
26028e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26128e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26228e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
26328e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
26428e30874SSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
26528e30874SSatish Balay   PetscFunctionReturn(0);
26628e30874SSatish Balay }
26728e30874SSatish Balay 
26828e30874SSatish Balay #undef __FUNCT__
26928e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
27028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
27128e30874SSatish Balay {
27228e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
27328e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
27428e30874SSatish Balay   PetscErrorCode    ierr;
27528e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
27628e30874SSatish Balay   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
27728e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
27828e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
27928e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
28028e30874SSatish Balay   const PetscScalar *b;
28128e30874SSatish Balay 
28228e30874SSatish Balay   PetscFunctionBegin;
28328e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
28428e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28528e30874SSatish Balay   t    = a->solve_work;
28628e30874SSatish Balay 
28728e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28828e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28928e30874SSatish Balay 
29028e30874SSatish Balay   /* forward solve the lower triangular */
29128e30874SSatish Balay   idx  = 6*(*r++);
29228e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
29328e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
29428e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
29528e30874SSatish Balay   for (i=1; i<n; i++) {
29628e30874SSatish Balay     v   = aa + 36*ai[i];
29728e30874SSatish Balay     vi  = aj + ai[i];
29828e30874SSatish Balay     nz  = diag[i] - ai[i];
29928e30874SSatish Balay     idx = 6*(*r++);
30028e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30128e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
30228e30874SSatish Balay     while (nz--) {
30328e30874SSatish Balay       idx = 6*(*vi++);
30428e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
30528e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
30628e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30728e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30828e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30928e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
31028e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
31128e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
31228e30874SSatish Balay       v  += 36;
31328e30874SSatish Balay     }
31428e30874SSatish Balay     idx      = 6*i;
31528e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
31628e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
31728e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
31828e30874SSatish Balay   }
31928e30874SSatish Balay   /* backward solve the upper triangular */
32028e30874SSatish Balay   for (i=n-1; i>=0; i--) {
32128e30874SSatish Balay     v   = aa + 36*diag[i] + 36;
32228e30874SSatish Balay     vi  = aj + diag[i] + 1;
32328e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
32428e30874SSatish Balay     idt = 6*i;
32528e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
32628e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
32728e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
32828e30874SSatish Balay     while (nz--) {
32928e30874SSatish Balay       idx = 6*(*vi++);
33028e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
33128e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
33228e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
33328e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
33428e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
33528e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
33628e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
33728e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
33828e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
33928e30874SSatish Balay       v  += 36;
34028e30874SSatish Balay     }
34128e30874SSatish Balay     idc    = 6*(*c--);
34228e30874SSatish Balay     v      = aa + 36*diag[i];
34328e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
34428e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
34528e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
34628e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
34728e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
34828e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
34928e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
35028e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
35128e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
35228e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
35328e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
35428e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
35528e30874SSatish Balay   }
35628e30874SSatish Balay 
35728e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
35828e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
35928e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
36028e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
36128e30874SSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
36228e30874SSatish Balay   PetscFunctionReturn(0);
36328e30874SSatish Balay }
36428e30874SSatish Balay 
36528e30874SSatish Balay #undef __FUNCT__
36628e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
36728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
36828e30874SSatish Balay {
36928e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
37028e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
37128e30874SSatish Balay   PetscErrorCode    ierr;
37228e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
37328e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
37428e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
37528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
37628e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
37728e30874SSatish Balay   const PetscScalar *b;
37828e30874SSatish Balay 
37928e30874SSatish Balay   PetscFunctionBegin;
38028e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
38128e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
38228e30874SSatish Balay   t    = a->solve_work;
38328e30874SSatish Balay 
38428e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
38528e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
38628e30874SSatish Balay 
38728e30874SSatish Balay   /* forward solve the lower triangular */
38828e30874SSatish Balay   idx  = 6*r[0];
38928e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
39028e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
39128e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
39228e30874SSatish Balay   for (i=1; i<n; i++) {
39328e30874SSatish Balay     v   = aa + 36*ai[i];
39428e30874SSatish Balay     vi  = aj + ai[i];
39528e30874SSatish Balay     nz  = ai[i+1] - ai[i];
39628e30874SSatish Balay     idx = 6*r[i];
39728e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
39828e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
39928e30874SSatish Balay     for (m=0; m<nz; m++) {
40028e30874SSatish Balay       idx = 6*vi[m];
40128e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
40228e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
40328e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
40428e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
40528e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
40628e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
40728e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
40828e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
40928e30874SSatish Balay       v  += 36;
41028e30874SSatish Balay     }
41128e30874SSatish Balay     idx      = 6*i;
41228e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
41328e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
41428e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
41528e30874SSatish Balay   }
41628e30874SSatish Balay   /* backward solve the upper triangular */
41728e30874SSatish Balay   for (i=n-1; i>=0; i--) {
41828e30874SSatish Balay     v   = aa + 36*(adiag[i+1]+1);
41928e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
42028e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
42128e30874SSatish Balay     idt = 6*i;
42228e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
42328e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
42428e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
42528e30874SSatish Balay     for (m=0; m<nz; m++) {
42628e30874SSatish Balay       idx = 6*vi[m];
42728e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
42828e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
42928e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
43028e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
43128e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
43228e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
43328e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
43428e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
43528e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
43628e30874SSatish Balay       v  += 36;
43728e30874SSatish Balay     }
43828e30874SSatish Balay     idc    = 6*c[i];
43928e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
44028e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
44128e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
44228e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
44328e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
44428e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
44528e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
44628e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
44728e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
44828e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
44928e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
45028e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
45128e30874SSatish Balay   }
45228e30874SSatish Balay 
45328e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45428e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45528e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
45628e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
45728e30874SSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
45828e30874SSatish Balay   PetscFunctionReturn(0);
45928e30874SSatish Balay }
46028e30874SSatish Balay 
46128e30874SSatish Balay #undef __FUNCT__
46228e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
46328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
46428e30874SSatish Balay {
46528e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
46628e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
46728e30874SSatish Balay   PetscErrorCode    ierr;
46828e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
46928e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
47028e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
47128e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
47228e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
47328e30874SSatish Balay   const PetscScalar *b;
47428e30874SSatish Balay 
47528e30874SSatish Balay   PetscFunctionBegin;
47628e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
47728e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
47828e30874SSatish Balay   t    = a->solve_work;
47928e30874SSatish Balay 
48028e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48128e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
48228e30874SSatish Balay 
48328e30874SSatish Balay   /* forward solve the lower triangular */
48428e30874SSatish Balay   idx  = 5*(*r++);
48528e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
48628e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
48728e30874SSatish Balay   for (i=1; i<n; i++) {
48828e30874SSatish Balay     v   = aa + 25*ai[i];
48928e30874SSatish Balay     vi  = aj + ai[i];
49028e30874SSatish Balay     nz  = diag[i] - ai[i];
49128e30874SSatish Balay     idx = 5*(*r++);
49228e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
49328e30874SSatish Balay     s5  = b[4+idx];
49428e30874SSatish Balay     while (nz--) {
49528e30874SSatish Balay       idx = 5*(*vi++);
49628e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
49728e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
49828e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
49928e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
50028e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
50128e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
50228e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
50328e30874SSatish Balay       v  += 25;
50428e30874SSatish Balay     }
50528e30874SSatish Balay     idx      = 5*i;
50628e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
50728e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
50828e30874SSatish Balay   }
50928e30874SSatish Balay   /* backward solve the upper triangular */
51028e30874SSatish Balay   for (i=n-1; i>=0; i--) {
51128e30874SSatish Balay     v   = aa + 25*diag[i] + 25;
51228e30874SSatish Balay     vi  = aj + diag[i] + 1;
51328e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
51428e30874SSatish Balay     idt = 5*i;
51528e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
51628e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
51728e30874SSatish Balay     while (nz--) {
51828e30874SSatish Balay       idx = 5*(*vi++);
51928e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
52028e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
52128e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
52228e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
52328e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
52428e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
52528e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
52628e30874SSatish Balay       v  += 25;
52728e30874SSatish Balay     }
52828e30874SSatish Balay     idc    = 5*(*c--);
52928e30874SSatish Balay     v      = aa + 25*diag[i];
53028e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
53128e30874SSatish Balay                         v[15]*s4+v[20]*s5;
53228e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
53328e30874SSatish Balay                           v[16]*s4+v[21]*s5;
53428e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
53528e30874SSatish Balay                           v[17]*s4+v[22]*s5;
53628e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
53728e30874SSatish Balay                           v[18]*s4+v[23]*s5;
53828e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
53928e30874SSatish Balay                           v[19]*s4+v[24]*s5;
54028e30874SSatish Balay   }
54128e30874SSatish Balay 
54228e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
54328e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
54428e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
54528e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54628e30874SSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
54728e30874SSatish Balay   PetscFunctionReturn(0);
54828e30874SSatish Balay }
54928e30874SSatish Balay 
55028e30874SSatish Balay #undef __FUNCT__
55128e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
55228e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
55328e30874SSatish Balay {
55428e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
55528e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
55628e30874SSatish Balay   PetscErrorCode    ierr;
55728e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
55828e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
55928e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
56028e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
56128e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
56228e30874SSatish Balay   const PetscScalar *b;
56328e30874SSatish Balay 
56428e30874SSatish Balay   PetscFunctionBegin;
56528e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
56628e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
56728e30874SSatish Balay   t    = a->solve_work;
56828e30874SSatish Balay 
56928e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
57028e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
57128e30874SSatish Balay 
57228e30874SSatish Balay   /* forward solve the lower triangular */
57328e30874SSatish Balay   idx  = 5*r[0];
57428e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
57528e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
57628e30874SSatish Balay   for (i=1; i<n; i++) {
57728e30874SSatish Balay     v   = aa + 25*ai[i];
57828e30874SSatish Balay     vi  = aj + ai[i];
57928e30874SSatish Balay     nz  = ai[i+1] - ai[i];
58028e30874SSatish Balay     idx = 5*r[i];
58128e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
58228e30874SSatish Balay     s5  = b[4+idx];
58328e30874SSatish Balay     for (m=0; m<nz; m++) {
58428e30874SSatish Balay       idx = 5*vi[m];
58528e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
58628e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
58728e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
58828e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
58928e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
59028e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
59128e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
59228e30874SSatish Balay       v  += 25;
59328e30874SSatish Balay     }
59428e30874SSatish Balay     idx      = 5*i;
59528e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
59628e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
59728e30874SSatish Balay   }
59828e30874SSatish Balay   /* backward solve the upper triangular */
59928e30874SSatish Balay   for (i=n-1; i>=0; i--) {
60028e30874SSatish Balay     v   = aa + 25*(adiag[i+1]+1);
60128e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
60228e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
60328e30874SSatish Balay     idt = 5*i;
60428e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
60528e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
60628e30874SSatish Balay     for (m=0; m<nz; m++) {
60728e30874SSatish Balay       idx = 5*vi[m];
60828e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
60928e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
61028e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
61128e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
61228e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
61328e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
61428e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
61528e30874SSatish Balay       v  += 25;
61628e30874SSatish Balay     }
61728e30874SSatish Balay     idc    = 5*c[i];
61828e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
61928e30874SSatish Balay                         v[15]*s4+v[20]*s5;
62028e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
62128e30874SSatish Balay                           v[16]*s4+v[21]*s5;
62228e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
62328e30874SSatish Balay                           v[17]*s4+v[22]*s5;
62428e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
62528e30874SSatish Balay                           v[18]*s4+v[23]*s5;
62628e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
62728e30874SSatish Balay                           v[19]*s4+v[24]*s5;
62828e30874SSatish Balay   }
62928e30874SSatish Balay 
63028e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
63128e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
63228e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
63328e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
63428e30874SSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
63528e30874SSatish Balay   PetscFunctionReturn(0);
63628e30874SSatish Balay }
63728e30874SSatish Balay 
63828e30874SSatish Balay #undef __FUNCT__
63928e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
64028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
64128e30874SSatish Balay {
64228e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
64328e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
64428e30874SSatish Balay   PetscErrorCode    ierr;
64528e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
64628e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
64728e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
64828e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
64928e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
65028e30874SSatish Balay   const PetscScalar *b;
65128e30874SSatish Balay 
65228e30874SSatish Balay   PetscFunctionBegin;
65328e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
65428e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
65528e30874SSatish Balay   t    = a->solve_work;
65628e30874SSatish Balay 
65728e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
65828e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
65928e30874SSatish Balay 
66028e30874SSatish Balay   /* forward solve the lower triangular */
66128e30874SSatish Balay   idx  = 4*(*r++);
66228e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
66328e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
66428e30874SSatish Balay   for (i=1; i<n; i++) {
66528e30874SSatish Balay     v   = aa + 16*ai[i];
66628e30874SSatish Balay     vi  = aj + ai[i];
66728e30874SSatish Balay     nz  = diag[i] - ai[i];
66828e30874SSatish Balay     idx = 4*(*r++);
66928e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
67028e30874SSatish Balay     while (nz--) {
67128e30874SSatish Balay       idx = 4*(*vi++);
67228e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
67328e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
67428e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
67528e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
67628e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
67728e30874SSatish Balay       v  += 16;
67828e30874SSatish Balay     }
67928e30874SSatish Balay     idx      = 4*i;
68028e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
68128e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
68228e30874SSatish Balay   }
68328e30874SSatish Balay   /* backward solve the upper triangular */
68428e30874SSatish Balay   for (i=n-1; i>=0; i--) {
68528e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
68628e30874SSatish Balay     vi  = aj + diag[i] + 1;
68728e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
68828e30874SSatish Balay     idt = 4*i;
68928e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
69028e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
69128e30874SSatish Balay     while (nz--) {
69228e30874SSatish Balay       idx = 4*(*vi++);
69328e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
69428e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
69528e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
69628e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
69728e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
69828e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
69928e30874SSatish Balay       v  += 16;
70028e30874SSatish Balay     }
70128e30874SSatish Balay     idc      = 4*(*c--);
70228e30874SSatish Balay     v        = aa + 16*diag[i];
70328e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
70428e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
70528e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
70628e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
70728e30874SSatish Balay   }
70828e30874SSatish Balay 
70928e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
71028e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
71128e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
71228e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
71328e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
71428e30874SSatish Balay   PetscFunctionReturn(0);
71528e30874SSatish Balay }
71628e30874SSatish Balay 
71728e30874SSatish Balay #undef __FUNCT__
71828e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
71928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
72028e30874SSatish Balay {
72128e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
72228e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
72328e30874SSatish Balay   PetscErrorCode    ierr;
72428e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
72528e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
72628e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
72728e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
72828e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
72928e30874SSatish Balay   const PetscScalar *b;
73028e30874SSatish Balay 
73128e30874SSatish Balay   PetscFunctionBegin;
73228e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
73328e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
73428e30874SSatish Balay   t    = a->solve_work;
73528e30874SSatish Balay 
73628e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
73728e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
73828e30874SSatish Balay 
73928e30874SSatish Balay   /* forward solve the lower triangular */
74028e30874SSatish Balay   idx  = 4*r[0];
74128e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
74228e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
74328e30874SSatish Balay   for (i=1; i<n; i++) {
74428e30874SSatish Balay     v   = aa + 16*ai[i];
74528e30874SSatish Balay     vi  = aj + ai[i];
74628e30874SSatish Balay     nz  = ai[i+1] - ai[i];
74728e30874SSatish Balay     idx = 4*r[i];
74828e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
74928e30874SSatish Balay     for (m=0; m<nz; m++) {
75028e30874SSatish Balay       idx = 4*vi[m];
75128e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
75228e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
75328e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
75428e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
75528e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
75628e30874SSatish Balay       v  += 16;
75728e30874SSatish Balay     }
75828e30874SSatish Balay     idx      = 4*i;
75928e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
76028e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
76128e30874SSatish Balay   }
76228e30874SSatish Balay   /* backward solve the upper triangular */
76328e30874SSatish Balay   for (i=n-1; i>=0; i--) {
76428e30874SSatish Balay     v   = aa + 16*(adiag[i+1]+1);
76528e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
76628e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
76728e30874SSatish Balay     idt = 4*i;
76828e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
76928e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
77028e30874SSatish Balay     for (m=0; m<nz; m++) {
77128e30874SSatish Balay       idx = 4*vi[m];
77228e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
77328e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
77428e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
77528e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
77628e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
77728e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
77828e30874SSatish Balay       v  += 16;
77928e30874SSatish Balay     }
78028e30874SSatish Balay     idc      = 4*c[i];
78128e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
78228e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
78328e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
78428e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
78528e30874SSatish Balay   }
78628e30874SSatish Balay 
78728e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
78828e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
78928e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
79028e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
79128e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
79228e30874SSatish Balay   PetscFunctionReturn(0);
79328e30874SSatish Balay }
79428e30874SSatish Balay 
79528e30874SSatish Balay #undef __FUNCT__
79628e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
79728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
79828e30874SSatish Balay {
79928e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
80028e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
80128e30874SSatish Balay   PetscErrorCode    ierr;
80228e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
80328e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
80428e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
80528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
80628e30874SSatish Balay   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
80728e30874SSatish Balay   PetscScalar       *x;
80828e30874SSatish Balay   const PetscScalar *b;
80928e30874SSatish Balay 
81028e30874SSatish Balay   PetscFunctionBegin;
81128e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
81228e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
81328e30874SSatish Balay   t    = (MatScalar*)a->solve_work;
81428e30874SSatish Balay 
81528e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
81628e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
81728e30874SSatish Balay 
81828e30874SSatish Balay   /* forward solve the lower triangular */
81928e30874SSatish Balay   idx  = 4*(*r++);
82028e30874SSatish Balay   t[0] = (MatScalar)b[idx];
82128e30874SSatish Balay   t[1] = (MatScalar)b[1+idx];
82228e30874SSatish Balay   t[2] = (MatScalar)b[2+idx];
82328e30874SSatish Balay   t[3] = (MatScalar)b[3+idx];
82428e30874SSatish Balay   for (i=1; i<n; i++) {
82528e30874SSatish Balay     v   = aa + 16*ai[i];
82628e30874SSatish Balay     vi  = aj + ai[i];
82728e30874SSatish Balay     nz  = diag[i] - ai[i];
82828e30874SSatish Balay     idx = 4*(*r++);
82928e30874SSatish Balay     s1  = (MatScalar)b[idx];
83028e30874SSatish Balay     s2  = (MatScalar)b[1+idx];
83128e30874SSatish Balay     s3  = (MatScalar)b[2+idx];
83228e30874SSatish Balay     s4  = (MatScalar)b[3+idx];
83328e30874SSatish Balay     while (nz--) {
83428e30874SSatish Balay       idx = 4*(*vi++);
83528e30874SSatish Balay       x1  = t[idx];
83628e30874SSatish Balay       x2  = t[1+idx];
83728e30874SSatish Balay       x3  = t[2+idx];
83828e30874SSatish Balay       x4  = t[3+idx];
83928e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
84028e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
84128e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
84228e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
84328e30874SSatish Balay       v  += 16;
84428e30874SSatish Balay     }
84528e30874SSatish Balay     idx      = 4*i;
84628e30874SSatish Balay     t[idx]   = s1;
84728e30874SSatish Balay     t[1+idx] = s2;
84828e30874SSatish Balay     t[2+idx] = s3;
84928e30874SSatish Balay     t[3+idx] = s4;
85028e30874SSatish Balay   }
85128e30874SSatish Balay   /* backward solve the upper triangular */
85228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
85328e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
85428e30874SSatish Balay     vi  = aj + diag[i] + 1;
85528e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
85628e30874SSatish Balay     idt = 4*i;
85728e30874SSatish Balay     s1  = t[idt];
85828e30874SSatish Balay     s2  = t[1+idt];
85928e30874SSatish Balay     s3  = t[2+idt];
86028e30874SSatish Balay     s4  = t[3+idt];
86128e30874SSatish Balay     while (nz--) {
86228e30874SSatish Balay       idx = 4*(*vi++);
86328e30874SSatish Balay       x1  = t[idx];
86428e30874SSatish Balay       x2  = t[1+idx];
86528e30874SSatish Balay       x3  = t[2+idx];
86628e30874SSatish Balay       x4  = t[3+idx];
86728e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
86828e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
86928e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
87028e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
87128e30874SSatish Balay       v  += 16;
87228e30874SSatish Balay     }
87328e30874SSatish Balay     idc      = 4*(*c--);
87428e30874SSatish Balay     v        = aa + 16*diag[i];
87528e30874SSatish Balay     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
87628e30874SSatish Balay     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
87728e30874SSatish Balay     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
87828e30874SSatish Balay     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
87928e30874SSatish Balay     x[idc]   = (PetscScalar)t[idt];
88028e30874SSatish Balay     x[1+idc] = (PetscScalar)t[1+idt];
88128e30874SSatish Balay     x[2+idc] = (PetscScalar)t[2+idt];
88228e30874SSatish Balay     x[3+idc] = (PetscScalar)t[3+idt];
88328e30874SSatish Balay   }
88428e30874SSatish Balay 
88528e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
88628e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
88728e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
88828e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
88928e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
89028e30874SSatish Balay   PetscFunctionReturn(0);
89128e30874SSatish Balay }
89228e30874SSatish Balay 
89328e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
89428e30874SSatish Balay 
89528e30874SSatish Balay #include PETSC_HAVE_SSE
89628e30874SSatish Balay 
89728e30874SSatish Balay #undef __FUNCT__
89828e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
89928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
90028e30874SSatish Balay {
90128e30874SSatish Balay   /*
90228e30874SSatish Balay      Note: This code uses demotion of double
90328e30874SSatish Balay      to float when performing the mixed-mode computation.
90428e30874SSatish Balay      This may not be numerically reasonable for all applications.
90528e30874SSatish Balay   */
90628e30874SSatish Balay   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
90728e30874SSatish Balay   IS             iscol=a->col,isrow=a->row;
90828e30874SSatish Balay   PetscErrorCode ierr;
90928e30874SSatish Balay   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
91028e30874SSatish Balay   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
91128e30874SSatish Balay   MatScalar      *aa=a->a,*v;
91228e30874SSatish Balay   PetscScalar    *x,*b,*t;
91328e30874SSatish Balay 
91428e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
91528e30874SSatish Balay   float         ssealignedspace[11],*tmps,*tmpx;
91628e30874SSatish Balay   unsigned long offset;
91728e30874SSatish Balay 
91828e30874SSatish Balay   PetscFunctionBegin;
91928e30874SSatish Balay   SSE_SCOPE_BEGIN;
92028e30874SSatish Balay 
92128e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
92228e30874SSatish Balay   if (offset) offset = (16 - offset)/4;
92328e30874SSatish Balay   tmps = &ssealignedspace[offset];
92428e30874SSatish Balay   tmpx = &ssealignedspace[offset+4];
92528e30874SSatish Balay   PREFETCH_NTA(aa+16*ai[1]);
92628e30874SSatish Balay 
92728e30874SSatish Balay   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
92828e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
92928e30874SSatish Balay   t    = a->solve_work;
93028e30874SSatish Balay 
93128e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
93228e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
93328e30874SSatish Balay 
93428e30874SSatish Balay   /* forward solve the lower triangular */
93528e30874SSatish Balay   idx  = 4*(*r++);
93628e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
93728e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
93828e30874SSatish Balay   v    =  aa + 16*ai[1];
93928e30874SSatish Balay 
94028e30874SSatish Balay   for (i=1; i<n; ) {
94128e30874SSatish Balay     PREFETCH_NTA(&v[8]);
94228e30874SSatish Balay     vi  =  aj      + ai[i];
94328e30874SSatish Balay     nz  =  diag[i] - ai[i];
94428e30874SSatish Balay     idx =  4*(*r++);
94528e30874SSatish Balay 
94628e30874SSatish Balay     /* Demote sum from double to float */
94728e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
94828e30874SSatish Balay     LOAD_PS(tmps,XMM7);
94928e30874SSatish Balay 
95028e30874SSatish Balay     while (nz--) {
95128e30874SSatish Balay       PREFETCH_NTA(&v[16]);
95228e30874SSatish Balay       idx = 4*(*vi++);
95328e30874SSatish Balay 
95428e30874SSatish Balay       /* Demote solution (so far) from double to float */
95528e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
95628e30874SSatish Balay 
95728e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
95828e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
95928e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
96028e30874SSatish Balay 
96128e30874SSatish Balay       /* First Column */
96228e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
96328e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
96428e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
96528e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
96628e30874SSatish Balay 
96728e30874SSatish Balay       /* Second Column */
96828e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
96928e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
97028e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
97128e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
97228e30874SSatish Balay 
97328e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
97428e30874SSatish Balay 
97528e30874SSatish Balay       /* Third Column */
97628e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
97728e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
97828e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
97928e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
98028e30874SSatish Balay 
98128e30874SSatish Balay       /* Fourth Column */
98228e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
98328e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
98428e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
98528e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
98628e30874SSatish Balay       SSE_INLINE_END_2
98728e30874SSatish Balay 
98828e30874SSatish Balay       v += 16;
98928e30874SSatish Balay     }
99028e30874SSatish Balay     idx = 4*i;
99128e30874SSatish Balay     v   = aa + 16*ai[++i];
99228e30874SSatish Balay     PREFETCH_NTA(v);
99328e30874SSatish Balay     STORE_PS(tmps,XMM7);
99428e30874SSatish Balay 
99528e30874SSatish Balay     /* Promote result from float to double */
99628e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
99728e30874SSatish Balay   }
99828e30874SSatish Balay   /* backward solve the upper triangular */
99928e30874SSatish Balay   idt  = 4*(n-1);
100028e30874SSatish Balay   ai16 = 16*diag[n-1];
100128e30874SSatish Balay   v    = aa + ai16 + 16;
100228e30874SSatish Balay   for (i=n-1; i>=0; ) {
100328e30874SSatish Balay     PREFETCH_NTA(&v[8]);
100428e30874SSatish Balay     vi = aj + diag[i] + 1;
100528e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
100628e30874SSatish Balay 
100728e30874SSatish Balay     /* Demote accumulator from double to float */
100828e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
100928e30874SSatish Balay     LOAD_PS(tmps,XMM7);
101028e30874SSatish Balay 
101128e30874SSatish Balay     while (nz--) {
101228e30874SSatish Balay       PREFETCH_NTA(&v[16]);
101328e30874SSatish Balay       idx = 4*(*vi++);
101428e30874SSatish Balay 
101528e30874SSatish Balay       /* Demote solution (so far) from double to float */
101628e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
101728e30874SSatish Balay 
101828e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
101928e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
102028e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
102128e30874SSatish Balay 
102228e30874SSatish Balay       /* First Column */
102328e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
102428e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
102528e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
102628e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
102728e30874SSatish Balay 
102828e30874SSatish Balay       /* Second Column */
102928e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
103028e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
103128e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
103228e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
103328e30874SSatish Balay 
103428e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
103528e30874SSatish Balay 
103628e30874SSatish Balay       /* Third Column */
103728e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
103828e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
103928e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
104028e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
104128e30874SSatish Balay 
104228e30874SSatish Balay       /* Fourth Column */
104328e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
104428e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
104528e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
104628e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
104728e30874SSatish Balay       SSE_INLINE_END_2
104828e30874SSatish Balay       v += 16;
104928e30874SSatish Balay     }
105028e30874SSatish Balay     v    = aa + ai16;
105128e30874SSatish Balay     ai16 = 16*diag[--i];
105228e30874SSatish Balay     PREFETCH_NTA(aa+ai16+16);
105328e30874SSatish Balay     /*
105428e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
105528e30874SSatish Balay        which was inverted as part of the factorization
105628e30874SSatish Balay     */
105728e30874SSatish Balay     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
105828e30874SSatish Balay     /* First Column */
105928e30874SSatish Balay     SSE_COPY_PS(XMM0,XMM7)
106028e30874SSatish Balay     SSE_SHUFFLE(XMM0,XMM0,0x00)
106128e30874SSatish Balay     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
106228e30874SSatish Balay 
106328e30874SSatish Balay     /* Second Column */
106428e30874SSatish Balay     SSE_COPY_PS(XMM1,XMM7)
106528e30874SSatish Balay     SSE_SHUFFLE(XMM1,XMM1,0x55)
106628e30874SSatish Balay     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
106728e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM1)
106828e30874SSatish Balay 
106928e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
107028e30874SSatish Balay 
107128e30874SSatish Balay     /* Third Column */
107228e30874SSatish Balay     SSE_COPY_PS(XMM2,XMM7)
107328e30874SSatish Balay     SSE_SHUFFLE(XMM2,XMM2,0xAA)
107428e30874SSatish Balay     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
107528e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM2)
107628e30874SSatish Balay 
107728e30874SSatish Balay     /* Fourth Column */
107828e30874SSatish Balay     SSE_COPY_PS(XMM3,XMM7)
107928e30874SSatish Balay     SSE_SHUFFLE(XMM3,XMM3,0xFF)
108028e30874SSatish Balay     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
108128e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM3)
108228e30874SSatish Balay 
108328e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
108428e30874SSatish Balay     SSE_INLINE_END_3
108528e30874SSatish Balay 
108628e30874SSatish Balay     /* Promote solution from float to double */
108728e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
108828e30874SSatish Balay 
108928e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
109028e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
109128e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
109228e30874SSatish Balay     idc = 4*(*c--);
109328e30874SSatish Balay     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
109428e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
109528e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
109628e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
109728e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
109828e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
109928e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
110028e30874SSatish Balay     SSE_INLINE_END_2
110128e30874SSatish Balay     v    = aa + ai16 + 16;
110228e30874SSatish Balay     idt -= 4;
110328e30874SSatish Balay   }
110428e30874SSatish Balay 
110528e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
110628e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
110728e30874SSatish Balay   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110828e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
110928e30874SSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
111028e30874SSatish Balay   SSE_SCOPE_END;
111128e30874SSatish Balay   PetscFunctionReturn(0);
111228e30874SSatish Balay }
111328e30874SSatish Balay 
111428e30874SSatish Balay #endif
111528e30874SSatish Balay 
111628e30874SSatish Balay #undef __FUNCT__
111728e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
111828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
111928e30874SSatish Balay {
112028e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
112128e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
112228e30874SSatish Balay   PetscErrorCode    ierr;
112328e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
112428e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
112528e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
112628e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
112728e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
112828e30874SSatish Balay   const PetscScalar *b;
112928e30874SSatish Balay 
113028e30874SSatish Balay   PetscFunctionBegin;
113128e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
113228e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
113328e30874SSatish Balay   t    = a->solve_work;
113428e30874SSatish Balay 
113528e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
113628e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
113728e30874SSatish Balay 
113828e30874SSatish Balay   /* forward solve the lower triangular */
113928e30874SSatish Balay   idx  = 3*(*r++);
114028e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
114128e30874SSatish Balay   for (i=1; i<n; i++) {
114228e30874SSatish Balay     v   = aa + 9*ai[i];
114328e30874SSatish Balay     vi  = aj + ai[i];
114428e30874SSatish Balay     nz  = diag[i] - ai[i];
114528e30874SSatish Balay     idx = 3*(*r++);
114628e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
114728e30874SSatish Balay     while (nz--) {
114828e30874SSatish Balay       idx = 3*(*vi++);
114928e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
115028e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
115128e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
115228e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
115328e30874SSatish Balay       v  += 9;
115428e30874SSatish Balay     }
115528e30874SSatish Balay     idx    = 3*i;
115628e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
115728e30874SSatish Balay   }
115828e30874SSatish Balay   /* backward solve the upper triangular */
115928e30874SSatish Balay   for (i=n-1; i>=0; i--) {
116028e30874SSatish Balay     v   = aa + 9*diag[i] + 9;
116128e30874SSatish Balay     vi  = aj + diag[i] + 1;
116228e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
116328e30874SSatish Balay     idt = 3*i;
116428e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
116528e30874SSatish Balay     while (nz--) {
116628e30874SSatish Balay       idx = 3*(*vi++);
116728e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
116828e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
116928e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
117028e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
117128e30874SSatish Balay       v  += 9;
117228e30874SSatish Balay     }
117328e30874SSatish Balay     idc      = 3*(*c--);
117428e30874SSatish Balay     v        = aa + 9*diag[i];
117528e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
117628e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
117728e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
117828e30874SSatish Balay   }
117928e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
118028e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
118128e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
118228e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
118328e30874SSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
118428e30874SSatish Balay   PetscFunctionReturn(0);
118528e30874SSatish Balay }
118628e30874SSatish Balay 
118728e30874SSatish Balay #undef __FUNCT__
118828e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
118928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
119028e30874SSatish Balay {
119128e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
119228e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
119328e30874SSatish Balay   PetscErrorCode    ierr;
119428e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
119528e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
119628e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
119728e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
119828e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
119928e30874SSatish Balay   const PetscScalar *b;
120028e30874SSatish Balay 
120128e30874SSatish Balay   PetscFunctionBegin;
120228e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
120328e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120428e30874SSatish Balay   t    = a->solve_work;
120528e30874SSatish Balay 
120628e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
120728e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
120828e30874SSatish Balay 
120928e30874SSatish Balay   /* forward solve the lower triangular */
121028e30874SSatish Balay   idx  = 3*r[0];
121128e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
121228e30874SSatish Balay   for (i=1; i<n; i++) {
121328e30874SSatish Balay     v   = aa + 9*ai[i];
121428e30874SSatish Balay     vi  = aj + ai[i];
121528e30874SSatish Balay     nz  = ai[i+1] - ai[i];
121628e30874SSatish Balay     idx = 3*r[i];
121728e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
121828e30874SSatish Balay     for (m=0; m<nz; m++) {
121928e30874SSatish Balay       idx = 3*vi[m];
122028e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
122128e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
122228e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
122328e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
122428e30874SSatish Balay       v  += 9;
122528e30874SSatish Balay     }
122628e30874SSatish Balay     idx    = 3*i;
122728e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
122828e30874SSatish Balay   }
122928e30874SSatish Balay   /* backward solve the upper triangular */
123028e30874SSatish Balay   for (i=n-1; i>=0; i--) {
123128e30874SSatish Balay     v   = aa + 9*(adiag[i+1]+1);
123228e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
123328e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
123428e30874SSatish Balay     idt = 3*i;
123528e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
123628e30874SSatish Balay     for (m=0; m<nz; m++) {
123728e30874SSatish Balay       idx = 3*vi[m];
123828e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
123928e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
124028e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
124128e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
124228e30874SSatish Balay       v  += 9;
124328e30874SSatish Balay     }
124428e30874SSatish Balay     idc      = 3*c[i];
124528e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
124628e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
124728e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
124828e30874SSatish Balay   }
124928e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
125028e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
125128e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
125228e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
125328e30874SSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
125428e30874SSatish Balay   PetscFunctionReturn(0);
125528e30874SSatish Balay }
125628e30874SSatish Balay 
125728e30874SSatish Balay #undef __FUNCT__
125828e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
125928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
126028e30874SSatish Balay {
126128e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
126228e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
126328e30874SSatish Balay   PetscErrorCode    ierr;
126428e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
126528e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
126628e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
126728e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
126828e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
126928e30874SSatish Balay   const PetscScalar *b;
127028e30874SSatish Balay 
127128e30874SSatish Balay   PetscFunctionBegin;
127228e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
127328e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
127428e30874SSatish Balay   t    = a->solve_work;
127528e30874SSatish Balay 
127628e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
127728e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
127828e30874SSatish Balay 
127928e30874SSatish Balay   /* forward solve the lower triangular */
128028e30874SSatish Balay   idx  = 2*(*r++);
128128e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
128228e30874SSatish Balay   for (i=1; i<n; i++) {
128328e30874SSatish Balay     v   = aa + 4*ai[i];
128428e30874SSatish Balay     vi  = aj + ai[i];
128528e30874SSatish Balay     nz  = diag[i] - ai[i];
128628e30874SSatish Balay     idx = 2*(*r++);
128728e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
128828e30874SSatish Balay     while (nz--) {
128928e30874SSatish Balay       idx = 2*(*vi++);
129028e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
129128e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
129228e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
129328e30874SSatish Balay       v  += 4;
129428e30874SSatish Balay     }
129528e30874SSatish Balay     idx    = 2*i;
129628e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
129728e30874SSatish Balay   }
129828e30874SSatish Balay   /* backward solve the upper triangular */
129928e30874SSatish Balay   for (i=n-1; i>=0; i--) {
130028e30874SSatish Balay     v   = aa + 4*diag[i] + 4;
130128e30874SSatish Balay     vi  = aj + diag[i] + 1;
130228e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
130328e30874SSatish Balay     idt = 2*i;
130428e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
130528e30874SSatish Balay     while (nz--) {
130628e30874SSatish Balay       idx = 2*(*vi++);
130728e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
130828e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
130928e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
131028e30874SSatish Balay       v  += 4;
131128e30874SSatish Balay     }
131228e30874SSatish Balay     idc      = 2*(*c--);
131328e30874SSatish Balay     v        = aa + 4*diag[i];
131428e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
131528e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
131628e30874SSatish Balay   }
131728e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
131828e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
131928e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
132028e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
132128e30874SSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
132228e30874SSatish Balay   PetscFunctionReturn(0);
132328e30874SSatish Balay }
132428e30874SSatish Balay 
132528e30874SSatish Balay #undef __FUNCT__
132628e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
132728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
132828e30874SSatish Balay {
132928e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
133028e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
133128e30874SSatish Balay   PetscErrorCode    ierr;
133228e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
133328e30874SSatish Balay   PetscInt          i,nz,idx,jdx,idt,idc,m;
133428e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
133528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
133628e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
133728e30874SSatish Balay   const PetscScalar *b;
133828e30874SSatish Balay 
133928e30874SSatish Balay   PetscFunctionBegin;
134028e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
134128e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134228e30874SSatish Balay   t    = a->solve_work;
134328e30874SSatish Balay 
134428e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
134528e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
134628e30874SSatish Balay 
134728e30874SSatish Balay   /* forward solve the lower triangular */
134828e30874SSatish Balay   idx  = 2*r[0];
134928e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
135028e30874SSatish Balay   for (i=1; i<n; i++) {
135128e30874SSatish Balay     v   = aa + 4*ai[i];
135228e30874SSatish Balay     vi  = aj + ai[i];
135328e30874SSatish Balay     nz  = ai[i+1] - ai[i];
135428e30874SSatish Balay     idx = 2*r[i];
135528e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
135628e30874SSatish Balay     for (m=0; m<nz; m++) {
135728e30874SSatish Balay       jdx = 2*vi[m];
135828e30874SSatish Balay       x1  = t[jdx]; x2 = t[1+jdx];
135928e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
136028e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
136128e30874SSatish Balay       v  += 4;
136228e30874SSatish Balay     }
136328e30874SSatish Balay     idx    = 2*i;
136428e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
136528e30874SSatish Balay   }
136628e30874SSatish Balay   /* backward solve the upper triangular */
136728e30874SSatish Balay   for (i=n-1; i>=0; i--) {
136828e30874SSatish Balay     v   = aa + 4*(adiag[i+1]+1);
136928e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
137028e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
137128e30874SSatish Balay     idt = 2*i;
137228e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
137328e30874SSatish Balay     for (m=0; m<nz; m++) {
137428e30874SSatish Balay       idx = 2*vi[m];
137528e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
137628e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
137728e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
137828e30874SSatish Balay       v  += 4;
137928e30874SSatish Balay     }
138028e30874SSatish Balay     idc      = 2*c[i];
138128e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
138228e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
138328e30874SSatish Balay   }
138428e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
138528e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
138628e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
138728e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
138828e30874SSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
138928e30874SSatish Balay   PetscFunctionReturn(0);
139028e30874SSatish Balay }
139128e30874SSatish Balay 
139228e30874SSatish Balay #undef __FUNCT__
139328e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
139428e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
139528e30874SSatish Balay {
139628e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
139728e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
139828e30874SSatish Balay   PetscErrorCode    ierr;
139928e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
140028e30874SSatish Balay   PetscInt          i,nz;
140128e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
140228e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
140328e30874SSatish Balay   PetscScalar       *x,s1,*t;
140428e30874SSatish Balay   const PetscScalar *b;
140528e30874SSatish Balay 
140628e30874SSatish Balay   PetscFunctionBegin;
140728e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
140828e30874SSatish Balay 
140928e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
141028e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
141128e30874SSatish Balay   t    = a->solve_work;
141228e30874SSatish Balay 
141328e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
141428e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
141528e30874SSatish Balay 
141628e30874SSatish Balay   /* forward solve the lower triangular */
141728e30874SSatish Balay   t[0] = b[*r++];
141828e30874SSatish Balay   for (i=1; i<n; i++) {
141928e30874SSatish Balay     v  = aa + ai[i];
142028e30874SSatish Balay     vi = aj + ai[i];
142128e30874SSatish Balay     nz = diag[i] - ai[i];
142228e30874SSatish Balay     s1 = b[*r++];
142328e30874SSatish Balay     while (nz--) {
142428e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
142528e30874SSatish Balay     }
142628e30874SSatish Balay     t[i] = s1;
142728e30874SSatish Balay   }
142828e30874SSatish Balay   /* backward solve the upper triangular */
142928e30874SSatish Balay   for (i=n-1; i>=0; i--) {
143028e30874SSatish Balay     v  = aa + diag[i] + 1;
143128e30874SSatish Balay     vi = aj + diag[i] + 1;
143228e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
143328e30874SSatish Balay     s1 = t[i];
143428e30874SSatish Balay     while (nz--) {
143528e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
143628e30874SSatish Balay     }
143728e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]]*s1;
143828e30874SSatish Balay   }
143928e30874SSatish Balay 
144028e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
144128e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
144228e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
144328e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
144428e30874SSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
144528e30874SSatish Balay   PetscFunctionReturn(0);
144628e30874SSatish Balay }
144728e30874SSatish Balay 
144828e30874SSatish Balay #undef __FUNCT__
144928e30874SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
145028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
145128e30874SSatish Balay {
145228e30874SSatish Balay   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
145328e30874SSatish Balay   IS                iscol = a->col,isrow = a->row;
145428e30874SSatish Balay   PetscErrorCode    ierr;
145528e30874SSatish Balay   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
145628e30874SSatish Balay   const PetscInt    *rout,*cout,*r,*c;
145728e30874SSatish Balay   PetscScalar       *x,*tmp,sum;
145828e30874SSatish Balay   const PetscScalar *b;
145928e30874SSatish Balay   const MatScalar   *aa = a->a,*v;
146028e30874SSatish Balay 
146128e30874SSatish Balay   PetscFunctionBegin;
146228e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
146328e30874SSatish Balay 
146428e30874SSatish Balay   ierr = VecGetArrayRead(bb,&b);CHKERRQ(ierr);
146528e30874SSatish Balay   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
146628e30874SSatish Balay   tmp  = a->solve_work;
146728e30874SSatish Balay 
146828e30874SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
146928e30874SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
147028e30874SSatish Balay 
147128e30874SSatish Balay   /* forward solve the lower triangular */
147228e30874SSatish Balay   tmp[0] = b[r[0]];
147328e30874SSatish Balay   v      = aa;
147428e30874SSatish Balay   vi     = aj;
147528e30874SSatish Balay   for (i=1; i<n; i++) {
147628e30874SSatish Balay     nz  = ai[i+1] - ai[i];
147728e30874SSatish Balay     sum = b[r[i]];
147828e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
147928e30874SSatish Balay     tmp[i] = sum;
148028e30874SSatish Balay     v     += nz; vi += nz;
148128e30874SSatish Balay   }
148228e30874SSatish Balay 
148328e30874SSatish Balay   /* backward solve the upper triangular */
148428e30874SSatish Balay   for (i=n-1; i>=0; i--) {
148528e30874SSatish Balay     v   = aa + adiag[i+1]+1;
148628e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
148728e30874SSatish Balay     nz  = adiag[i]-adiag[i+1]-1;
148828e30874SSatish Balay     sum = tmp[i];
148928e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
149028e30874SSatish Balay     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
149128e30874SSatish Balay   }
149228e30874SSatish Balay 
149328e30874SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
149428e30874SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
149528e30874SSatish Balay   ierr = VecRestoreArrayRead(bb,&b);CHKERRQ(ierr);
149628e30874SSatish Balay   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
149728e30874SSatish Balay   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
149828e30874SSatish Balay   PetscFunctionReturn(0);
149928e30874SSatish Balay }
150028e30874SSatish Balay 
1501