xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 9566063d113dddea24716c546802770db7481bc0)
128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
328e30874SSatish Balay 
428e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
528e30874SSatish Balay {
628e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
728e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
828e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
928e30874SSatish Balay   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
1028e30874SSatish Balay   PetscInt          i,nz;
1128e30874SSatish Balay   const PetscInt    bs =A->rmap->bs,bs2=a->bs2;
1228e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
1328e30874SSatish Balay   PetscScalar       *x,*s,*t,*ls;
1428e30874SSatish Balay   const PetscScalar *b;
1528e30874SSatish Balay 
1628e30874SSatish Balay   PetscFunctionBegin;
17*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
18*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
1928e30874SSatish Balay   t    = a->solve_work;
2028e30874SSatish Balay 
21*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
22*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
2328e30874SSatish Balay 
2428e30874SSatish Balay   /* forward solve the lower triangular */
25*9566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(t,b+bs*(*r++),bs));
2628e30874SSatish Balay   for (i=1; i<n; i++) {
2728e30874SSatish Balay     v    = aa + bs2*ai[i];
2828e30874SSatish Balay     vi   = aj + ai[i];
2928e30874SSatish Balay     nz   = a->diag[i] - ai[i];
3028e30874SSatish Balay     s    = t + bs*i;
31*9566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(s,b+bs*(*r++),bs));
3228e30874SSatish Balay     while (nz--) {
3328e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
3428e30874SSatish Balay       v += bs2;
3528e30874SSatish Balay     }
3628e30874SSatish Balay   }
3728e30874SSatish Balay   /* backward solve the upper triangular */
3828e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
3928e30874SSatish Balay   for (i=n-1; i>=0; i--) {
4028e30874SSatish Balay     v    = aa + bs2*(a->diag[i] + 1);
4128e30874SSatish Balay     vi   = aj + a->diag[i] + 1;
4228e30874SSatish Balay     nz   = ai[i+1] - a->diag[i] - 1;
43*9566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(ls,t+i*bs,bs));
4428e30874SSatish Balay     while (nz--) {
4528e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
4628e30874SSatish Balay       v += bs2;
4728e30874SSatish Balay     }
4828e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
49*9566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(x + bs*(*c--),t+i*bs,bs));
5028e30874SSatish Balay   }
5128e30874SSatish Balay 
52*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
53*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
54*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
55*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
56*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n));
5728e30874SSatish Balay   PetscFunctionReturn(0);
5828e30874SSatish Balay }
5928e30874SSatish Balay 
6028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
6128e30874SSatish Balay {
6228e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
6328e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
6428e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
6528e30874SSatish Balay   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
6628e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
6728e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
6828e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
6928e30874SSatish Balay   const PetscScalar *b;
7028e30874SSatish Balay 
7128e30874SSatish Balay   PetscFunctionBegin;
72*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
73*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
7428e30874SSatish Balay   t    = a->solve_work;
7528e30874SSatish Balay 
76*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
77*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
7828e30874SSatish Balay 
7928e30874SSatish Balay   /* forward solve the lower triangular */
8028e30874SSatish Balay   idx  = 7*(*r++);
8128e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
8228e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
8328e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
8428e30874SSatish Balay 
8528e30874SSatish Balay   for (i=1; i<n; i++) {
8628e30874SSatish Balay     v   = aa + 49*ai[i];
8728e30874SSatish Balay     vi  = aj + ai[i];
8828e30874SSatish Balay     nz  = diag[i] - ai[i];
8928e30874SSatish Balay     idx = 7*(*r++);
9028e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
9128e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
9228e30874SSatish Balay     while (nz--) {
9328e30874SSatish Balay       idx = 7*(*vi++);
9428e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
9528e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
9628e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
9728e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
9828e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
9928e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
10028e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
10128e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
10228e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
10328e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
10428e30874SSatish Balay       v  += 49;
10528e30874SSatish Balay     }
10628e30874SSatish Balay     idx      = 7*i;
10728e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
10828e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
10928e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
11028e30874SSatish Balay   }
11128e30874SSatish Balay   /* backward solve the upper triangular */
11228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
11328e30874SSatish Balay     v   = aa + 49*diag[i] + 49;
11428e30874SSatish Balay     vi  = aj + diag[i] + 1;
11528e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
11628e30874SSatish Balay     idt = 7*i;
11728e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
11828e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
11928e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
12028e30874SSatish Balay     while (nz--) {
12128e30874SSatish Balay       idx = 7*(*vi++);
12228e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
12328e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
12428e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
12528e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
12628e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
12728e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
12828e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
12928e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13028e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13128e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13228e30874SSatish Balay       v  += 49;
13328e30874SSatish Balay     }
13428e30874SSatish Balay     idc    = 7*(*c--);
13528e30874SSatish Balay     v      = aa + 49*diag[i];
13628e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13728e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13828e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13928e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
14028e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
14128e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
14228e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
14328e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
14428e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
14528e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
14628e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
14728e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
14828e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
14928e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
15028e30874SSatish Balay   }
15128e30874SSatish Balay 
152*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
153*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
154*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
155*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
156*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n));
15728e30874SSatish Balay   PetscFunctionReturn(0);
15828e30874SSatish Balay }
15928e30874SSatish Balay 
16028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
16128e30874SSatish Balay {
16228e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
16328e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
16428e30874SSatish Balay   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
16528e30874SSatish Balay   const PetscInt    n=a->mbs,*rout,*cout,*vi;
16628e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
16728e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
16828e30874SSatish Balay   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
16928e30874SSatish Balay   const PetscScalar *b;
17028e30874SSatish Balay 
17128e30874SSatish Balay   PetscFunctionBegin;
172*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
173*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
17428e30874SSatish Balay   t    = a->solve_work;
17528e30874SSatish Balay 
176*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
177*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
17828e30874SSatish Balay 
17928e30874SSatish Balay   /* forward solve the lower triangular */
18028e30874SSatish Balay   idx  = 7*r[0];
18128e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
18228e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
18328e30874SSatish Balay   t[5] = b[5+idx]; t[6] = b[6+idx];
18428e30874SSatish Balay 
18528e30874SSatish Balay   for (i=1; i<n; i++) {
18628e30874SSatish Balay     v   = aa + 49*ai[i];
18728e30874SSatish Balay     vi  = aj + ai[i];
18828e30874SSatish Balay     nz  = ai[i+1] - ai[i];
18928e30874SSatish Balay     idx = 7*r[i];
19028e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
19128e30874SSatish Balay     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
19228e30874SSatish Balay     for (m=0; m<nz; m++) {
19328e30874SSatish Balay       idx = 7*vi[m];
19428e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
19528e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
19628e30874SSatish Balay       x6  = t[5+idx];x7 = t[6+idx];
19728e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
19828e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
19928e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
20028e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
20128e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
20228e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
20328e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
20428e30874SSatish Balay       v  += 49;
20528e30874SSatish Balay     }
20628e30874SSatish Balay     idx      = 7*i;
20728e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
20828e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
20928e30874SSatish Balay     t[5+idx] = s6;t[6+idx] = s7;
21028e30874SSatish Balay   }
21128e30874SSatish Balay   /* backward solve the upper triangular */
21228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
21328e30874SSatish Balay     v   = aa + 49*(adiag[i+1]+1);
21428e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
21528e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
21628e30874SSatish Balay     idt = 7*i;
21728e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
21828e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
21928e30874SSatish Balay     s6  = t[5+idt];s7 = t[6+idt];
22028e30874SSatish Balay     for (m=0; m<nz; m++) {
22128e30874SSatish Balay       idx = 7*vi[m];
22228e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
22328e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
22428e30874SSatish Balay       x6  = t[5+idx]; x7 = t[6+idx];
22528e30874SSatish Balay       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
22628e30874SSatish Balay       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
22728e30874SSatish Balay       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
22828e30874SSatish Balay       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
22928e30874SSatish Balay       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
23028e30874SSatish Balay       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
23128e30874SSatish Balay       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
23228e30874SSatish Balay       v  += 49;
23328e30874SSatish Balay     }
23428e30874SSatish Balay     idc    = 7*c[i];
23528e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
23628e30874SSatish Balay                         v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
23728e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
23828e30874SSatish Balay                           v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
23928e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
24028e30874SSatish Balay                           v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
24128e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
24228e30874SSatish Balay                           v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
24328e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
24428e30874SSatish Balay                           v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
24528e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
24628e30874SSatish Balay                           v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
24728e30874SSatish Balay     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
24828e30874SSatish Balay                           v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
24928e30874SSatish Balay   }
25028e30874SSatish Balay 
251*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
252*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
253*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
254*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
255*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n));
25628e30874SSatish Balay   PetscFunctionReturn(0);
25728e30874SSatish Balay }
25828e30874SSatish Balay 
25928e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
26028e30874SSatish Balay {
26128e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
26228e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
26328e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
26428e30874SSatish Balay   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
26528e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
26628e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
26728e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
26828e30874SSatish Balay   const PetscScalar *b;
26928e30874SSatish Balay 
27028e30874SSatish Balay   PetscFunctionBegin;
271*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
272*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
27328e30874SSatish Balay   t    = a->solve_work;
27428e30874SSatish Balay 
275*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
276*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
27728e30874SSatish Balay 
27828e30874SSatish Balay   /* forward solve the lower triangular */
27928e30874SSatish Balay   idx  = 6*(*r++);
28028e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
28128e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
28228e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
28328e30874SSatish Balay   for (i=1; i<n; i++) {
28428e30874SSatish Balay     v   = aa + 36*ai[i];
28528e30874SSatish Balay     vi  = aj + ai[i];
28628e30874SSatish Balay     nz  = diag[i] - ai[i];
28728e30874SSatish Balay     idx = 6*(*r++);
28828e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
28928e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
29028e30874SSatish Balay     while (nz--) {
29128e30874SSatish Balay       idx = 6*(*vi++);
29228e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
29328e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
29428e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
29528e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
29628e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
29728e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
29828e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
29928e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30028e30874SSatish Balay       v  += 36;
30128e30874SSatish Balay     }
30228e30874SSatish Balay     idx      = 6*i;
30328e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
30428e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
30528e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
30628e30874SSatish Balay   }
30728e30874SSatish Balay   /* backward solve the upper triangular */
30828e30874SSatish Balay   for (i=n-1; i>=0; i--) {
30928e30874SSatish Balay     v   = aa + 36*diag[i] + 36;
31028e30874SSatish Balay     vi  = aj + diag[i] + 1;
31128e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
31228e30874SSatish Balay     idt = 6*i;
31328e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
31428e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
31528e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
31628e30874SSatish Balay     while (nz--) {
31728e30874SSatish Balay       idx = 6*(*vi++);
31828e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
31928e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
32028e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
32128e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
32228e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
32328e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
32428e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
32528e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
32628e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
32728e30874SSatish Balay       v  += 36;
32828e30874SSatish Balay     }
32928e30874SSatish Balay     idc    = 6*(*c--);
33028e30874SSatish Balay     v      = aa + 36*diag[i];
33128e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
33228e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
33328e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
33428e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
33528e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
33628e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
33728e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
33828e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
33928e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
34028e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
34128e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
34228e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
34328e30874SSatish Balay   }
34428e30874SSatish Balay 
345*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
346*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
347*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
348*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
349*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n));
35028e30874SSatish Balay   PetscFunctionReturn(0);
35128e30874SSatish Balay }
35228e30874SSatish Balay 
35328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
35428e30874SSatish Balay {
35528e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
35628e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
35728e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
35828e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
35928e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
36028e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
36128e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
36228e30874SSatish Balay   const PetscScalar *b;
36328e30874SSatish Balay 
36428e30874SSatish Balay   PetscFunctionBegin;
365*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
366*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
36728e30874SSatish Balay   t    = a->solve_work;
36828e30874SSatish Balay 
369*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
370*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
37128e30874SSatish Balay 
37228e30874SSatish Balay   /* forward solve the lower triangular */
37328e30874SSatish Balay   idx  = 6*r[0];
37428e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
37528e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
37628e30874SSatish Balay   t[4] = b[4+idx]; t[5] = b[5+idx];
37728e30874SSatish Balay   for (i=1; i<n; i++) {
37828e30874SSatish Balay     v   = aa + 36*ai[i];
37928e30874SSatish Balay     vi  = aj + ai[i];
38028e30874SSatish Balay     nz  = ai[i+1] - ai[i];
38128e30874SSatish Balay     idx = 6*r[i];
38228e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
38328e30874SSatish Balay     s5  = b[4+idx]; s6 = b[5+idx];
38428e30874SSatish Balay     for (m=0; m<nz; m++) {
38528e30874SSatish Balay       idx = 6*vi[m];
38628e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
38728e30874SSatish Balay       x4  = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
38828e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
38928e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
39028e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
39128e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
39228e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
39328e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
39428e30874SSatish Balay       v  += 36;
39528e30874SSatish Balay     }
39628e30874SSatish Balay     idx      = 6*i;
39728e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
39828e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
39928e30874SSatish Balay     t[4+idx] = s5;t[5+idx] = s6;
40028e30874SSatish Balay   }
40128e30874SSatish Balay   /* backward solve the upper triangular */
40228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
40328e30874SSatish Balay     v   = aa + 36*(adiag[i+1]+1);
40428e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
40528e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
40628e30874SSatish Balay     idt = 6*i;
40728e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
40828e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
40928e30874SSatish Balay     s5  = t[4+idt];s6 = t[5+idt];
41028e30874SSatish Balay     for (m=0; m<nz; m++) {
41128e30874SSatish Balay       idx = 6*vi[m];
41228e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
41328e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
41428e30874SSatish Balay       x5  = t[4+idx]; x6 = t[5+idx];
41528e30874SSatish Balay       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
41628e30874SSatish Balay       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
41728e30874SSatish Balay       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
41828e30874SSatish Balay       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
41928e30874SSatish Balay       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
42028e30874SSatish Balay       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
42128e30874SSatish Balay       v  += 36;
42228e30874SSatish Balay     }
42328e30874SSatish Balay     idc    = 6*c[i];
42428e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
42528e30874SSatish Balay                         v[18]*s4+v[24]*s5+v[30]*s6;
42628e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
42728e30874SSatish Balay                           v[19]*s4+v[25]*s5+v[31]*s6;
42828e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
42928e30874SSatish Balay                           v[20]*s4+v[26]*s5+v[32]*s6;
43028e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
43128e30874SSatish Balay                           v[21]*s4+v[27]*s5+v[33]*s6;
43228e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
43328e30874SSatish Balay                           v[22]*s4+v[28]*s5+v[34]*s6;
43428e30874SSatish Balay     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
43528e30874SSatish Balay                           v[23]*s4+v[29]*s5+v[35]*s6;
43628e30874SSatish Balay   }
43728e30874SSatish Balay 
438*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
439*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
440*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
441*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
442*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n));
44328e30874SSatish Balay   PetscFunctionReturn(0);
44428e30874SSatish Balay }
44528e30874SSatish Balay 
44628e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
44728e30874SSatish Balay {
44828e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
44928e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
45028e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
45128e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
45228e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
45328e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
45428e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
45528e30874SSatish Balay   const PetscScalar *b;
45628e30874SSatish Balay 
45728e30874SSatish Balay   PetscFunctionBegin;
458*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
459*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
46028e30874SSatish Balay   t    = a->solve_work;
46128e30874SSatish Balay 
462*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
463*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
46428e30874SSatish Balay 
46528e30874SSatish Balay   /* forward solve the lower triangular */
46628e30874SSatish Balay   idx  = 5*(*r++);
46728e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
46828e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
46928e30874SSatish Balay   for (i=1; i<n; i++) {
47028e30874SSatish Balay     v   = aa + 25*ai[i];
47128e30874SSatish Balay     vi  = aj + ai[i];
47228e30874SSatish Balay     nz  = diag[i] - ai[i];
47328e30874SSatish Balay     idx = 5*(*r++);
47428e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
47528e30874SSatish Balay     s5  = b[4+idx];
47628e30874SSatish Balay     while (nz--) {
47728e30874SSatish Balay       idx = 5*(*vi++);
47828e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
47928e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
48028e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
48128e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
48228e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
48328e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
48428e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
48528e30874SSatish Balay       v  += 25;
48628e30874SSatish Balay     }
48728e30874SSatish Balay     idx      = 5*i;
48828e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
48928e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
49028e30874SSatish Balay   }
49128e30874SSatish Balay   /* backward solve the upper triangular */
49228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
49328e30874SSatish Balay     v   = aa + 25*diag[i] + 25;
49428e30874SSatish Balay     vi  = aj + diag[i] + 1;
49528e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
49628e30874SSatish Balay     idt = 5*i;
49728e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
49828e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
49928e30874SSatish Balay     while (nz--) {
50028e30874SSatish Balay       idx = 5*(*vi++);
50128e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
50228e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
50328e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
50428e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
50528e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
50628e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
50728e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
50828e30874SSatish Balay       v  += 25;
50928e30874SSatish Balay     }
51028e30874SSatish Balay     idc    = 5*(*c--);
51128e30874SSatish Balay     v      = aa + 25*diag[i];
51228e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
51328e30874SSatish Balay                         v[15]*s4+v[20]*s5;
51428e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
51528e30874SSatish Balay                           v[16]*s4+v[21]*s5;
51628e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
51728e30874SSatish Balay                           v[17]*s4+v[22]*s5;
51828e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
51928e30874SSatish Balay                           v[18]*s4+v[23]*s5;
52028e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
52128e30874SSatish Balay                           v[19]*s4+v[24]*s5;
52228e30874SSatish Balay   }
52328e30874SSatish Balay 
524*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
525*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
526*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
527*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
528*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n));
52928e30874SSatish Balay   PetscFunctionReturn(0);
53028e30874SSatish Balay }
53128e30874SSatish Balay 
53228e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
53328e30874SSatish Balay {
53428e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
53528e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
53628e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
53728e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
53828e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
53928e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
54028e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
54128e30874SSatish Balay   const PetscScalar *b;
54228e30874SSatish Balay 
54328e30874SSatish Balay   PetscFunctionBegin;
544*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
545*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
54628e30874SSatish Balay   t    = a->solve_work;
54728e30874SSatish Balay 
548*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
549*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
55028e30874SSatish Balay 
55128e30874SSatish Balay   /* forward solve the lower triangular */
55228e30874SSatish Balay   idx  = 5*r[0];
55328e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
55428e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
55528e30874SSatish Balay   for (i=1; i<n; i++) {
55628e30874SSatish Balay     v   = aa + 25*ai[i];
55728e30874SSatish Balay     vi  = aj + ai[i];
55828e30874SSatish Balay     nz  = ai[i+1] - ai[i];
55928e30874SSatish Balay     idx = 5*r[i];
56028e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
56128e30874SSatish Balay     s5  = b[4+idx];
56228e30874SSatish Balay     for (m=0; m<nz; m++) {
56328e30874SSatish Balay       idx = 5*vi[m];
56428e30874SSatish Balay       x1  = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
56528e30874SSatish Balay       x4  = t[3+idx];x5 = t[4+idx];
56628e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
56728e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
56828e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
56928e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
57028e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
57128e30874SSatish Balay       v  += 25;
57228e30874SSatish Balay     }
57328e30874SSatish Balay     idx      = 5*i;
57428e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
57528e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
57628e30874SSatish Balay   }
57728e30874SSatish Balay   /* backward solve the upper triangular */
57828e30874SSatish Balay   for (i=n-1; i>=0; i--) {
57928e30874SSatish Balay     v   = aa + 25*(adiag[i+1]+1);
58028e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
58128e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
58228e30874SSatish Balay     idt = 5*i;
58328e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
58428e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
58528e30874SSatish Balay     for (m=0; m<nz; m++) {
58628e30874SSatish Balay       idx = 5*vi[m];
58728e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
58828e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
58928e30874SSatish Balay       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
59028e30874SSatish Balay       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
59128e30874SSatish Balay       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
59228e30874SSatish Balay       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
59328e30874SSatish Balay       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
59428e30874SSatish Balay       v  += 25;
59528e30874SSatish Balay     }
59628e30874SSatish Balay     idc    = 5*c[i];
59728e30874SSatish Balay     x[idc] = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
59828e30874SSatish Balay                         v[15]*s4+v[20]*s5;
59928e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
60028e30874SSatish Balay                           v[16]*s4+v[21]*s5;
60128e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
60228e30874SSatish Balay                           v[17]*s4+v[22]*s5;
60328e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
60428e30874SSatish Balay                           v[18]*s4+v[23]*s5;
60528e30874SSatish Balay     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
60628e30874SSatish Balay                           v[19]*s4+v[24]*s5;
60728e30874SSatish Balay   }
60828e30874SSatish Balay 
609*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
610*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
611*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
612*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
613*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n));
61428e30874SSatish Balay   PetscFunctionReturn(0);
61528e30874SSatish Balay }
61628e30874SSatish Balay 
61728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
61828e30874SSatish Balay {
61928e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
62028e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
62128e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
62228e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
62328e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
62428e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
62528e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
62628e30874SSatish Balay   const PetscScalar *b;
62728e30874SSatish Balay 
62828e30874SSatish Balay   PetscFunctionBegin;
629*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
630*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
63128e30874SSatish Balay   t    = a->solve_work;
63228e30874SSatish Balay 
633*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
634*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
63528e30874SSatish Balay 
63628e30874SSatish Balay   /* forward solve the lower triangular */
63728e30874SSatish Balay   idx  = 4*(*r++);
63828e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
63928e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
64028e30874SSatish Balay   for (i=1; i<n; i++) {
64128e30874SSatish Balay     v   = aa + 16*ai[i];
64228e30874SSatish Balay     vi  = aj + ai[i];
64328e30874SSatish Balay     nz  = diag[i] - ai[i];
64428e30874SSatish Balay     idx = 4*(*r++);
64528e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
64628e30874SSatish Balay     while (nz--) {
64728e30874SSatish Balay       idx = 4*(*vi++);
64828e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
64928e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
65028e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
65128e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
65228e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
65328e30874SSatish Balay       v  += 16;
65428e30874SSatish Balay     }
65528e30874SSatish Balay     idx      = 4*i;
65628e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
65728e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
65828e30874SSatish Balay   }
65928e30874SSatish Balay   /* backward solve the upper triangular */
66028e30874SSatish Balay   for (i=n-1; i>=0; i--) {
66128e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
66228e30874SSatish Balay     vi  = aj + diag[i] + 1;
66328e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
66428e30874SSatish Balay     idt = 4*i;
66528e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
66628e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
66728e30874SSatish Balay     while (nz--) {
66828e30874SSatish Balay       idx = 4*(*vi++);
66928e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
67028e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
67128e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
67228e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
67328e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
67428e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
67528e30874SSatish Balay       v  += 16;
67628e30874SSatish Balay     }
67728e30874SSatish Balay     idc      = 4*(*c--);
67828e30874SSatish Balay     v        = aa + 16*diag[i];
67928e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
68028e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
68128e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
68228e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
68328e30874SSatish Balay   }
68428e30874SSatish Balay 
685*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
686*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
687*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
688*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
689*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
69028e30874SSatish Balay   PetscFunctionReturn(0);
69128e30874SSatish Balay }
69228e30874SSatish Balay 
69328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
69428e30874SSatish Balay {
69528e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
69628e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
69728e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
69828e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
69928e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
70028e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
70128e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
70228e30874SSatish Balay   const PetscScalar *b;
70328e30874SSatish Balay 
70428e30874SSatish Balay   PetscFunctionBegin;
705*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
706*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
70728e30874SSatish Balay   t    = a->solve_work;
70828e30874SSatish Balay 
709*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
710*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
71128e30874SSatish Balay 
71228e30874SSatish Balay   /* forward solve the lower triangular */
71328e30874SSatish Balay   idx  = 4*r[0];
71428e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
71528e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
71628e30874SSatish Balay   for (i=1; i<n; i++) {
71728e30874SSatish Balay     v   = aa + 16*ai[i];
71828e30874SSatish Balay     vi  = aj + ai[i];
71928e30874SSatish Balay     nz  = ai[i+1] - ai[i];
72028e30874SSatish Balay     idx = 4*r[i];
72128e30874SSatish Balay     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
72228e30874SSatish Balay     for (m=0; m<nz; m++) {
72328e30874SSatish Balay       idx = 4*vi[m];
72428e30874SSatish Balay       x1  = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
72528e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
72628e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
72728e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
72828e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
72928e30874SSatish Balay       v  += 16;
73028e30874SSatish Balay     }
73128e30874SSatish Balay     idx      = 4*i;
73228e30874SSatish Balay     t[idx]   = s1;t[1+idx] = s2;
73328e30874SSatish Balay     t[2+idx] = s3;t[3+idx] = s4;
73428e30874SSatish Balay   }
73528e30874SSatish Balay   /* backward solve the upper triangular */
73628e30874SSatish Balay   for (i=n-1; i>=0; i--) {
73728e30874SSatish Balay     v   = aa + 16*(adiag[i+1]+1);
73828e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
73928e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
74028e30874SSatish Balay     idt = 4*i;
74128e30874SSatish Balay     s1  = t[idt];  s2 = t[1+idt];
74228e30874SSatish Balay     s3  = t[2+idt];s4 = t[3+idt];
74328e30874SSatish Balay     for (m=0; m<nz; m++) {
74428e30874SSatish Balay       idx = 4*vi[m];
74528e30874SSatish Balay       x1  = t[idx];   x2 = t[1+idx];
74628e30874SSatish Balay       x3  = t[2+idx]; x4 = t[3+idx];
74728e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
74828e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
74928e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
75028e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
75128e30874SSatish Balay       v  += 16;
75228e30874SSatish Balay     }
75328e30874SSatish Balay     idc      = 4*c[i];
75428e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
75528e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
75628e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
75728e30874SSatish Balay     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
75828e30874SSatish Balay   }
75928e30874SSatish Balay 
760*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
761*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
762*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
763*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
764*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
76528e30874SSatish Balay   PetscFunctionReturn(0);
76628e30874SSatish Balay }
76728e30874SSatish Balay 
76828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
76928e30874SSatish Balay {
77028e30874SSatish Balay   Mat_SeqBAIJ       *a   = (Mat_SeqBAIJ*)A->data;
77128e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
77228e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
77328e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
77428e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
77528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
77628e30874SSatish Balay   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
77728e30874SSatish Balay   PetscScalar       *x;
77828e30874SSatish Balay   const PetscScalar *b;
77928e30874SSatish Balay 
78028e30874SSatish Balay   PetscFunctionBegin;
781*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
782*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
78328e30874SSatish Balay   t    = (MatScalar*)a->solve_work;
78428e30874SSatish Balay 
785*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
786*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
78728e30874SSatish Balay 
78828e30874SSatish Balay   /* forward solve the lower triangular */
78928e30874SSatish Balay   idx  = 4*(*r++);
79028e30874SSatish Balay   t[0] = (MatScalar)b[idx];
79128e30874SSatish Balay   t[1] = (MatScalar)b[1+idx];
79228e30874SSatish Balay   t[2] = (MatScalar)b[2+idx];
79328e30874SSatish Balay   t[3] = (MatScalar)b[3+idx];
79428e30874SSatish Balay   for (i=1; i<n; i++) {
79528e30874SSatish Balay     v   = aa + 16*ai[i];
79628e30874SSatish Balay     vi  = aj + ai[i];
79728e30874SSatish Balay     nz  = diag[i] - ai[i];
79828e30874SSatish Balay     idx = 4*(*r++);
79928e30874SSatish Balay     s1  = (MatScalar)b[idx];
80028e30874SSatish Balay     s2  = (MatScalar)b[1+idx];
80128e30874SSatish Balay     s3  = (MatScalar)b[2+idx];
80228e30874SSatish Balay     s4  = (MatScalar)b[3+idx];
80328e30874SSatish Balay     while (nz--) {
80428e30874SSatish Balay       idx = 4*(*vi++);
80528e30874SSatish Balay       x1  = t[idx];
80628e30874SSatish Balay       x2  = t[1+idx];
80728e30874SSatish Balay       x3  = t[2+idx];
80828e30874SSatish Balay       x4  = t[3+idx];
80928e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
81028e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
81128e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
81228e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
81328e30874SSatish Balay       v  += 16;
81428e30874SSatish Balay     }
81528e30874SSatish Balay     idx      = 4*i;
81628e30874SSatish Balay     t[idx]   = s1;
81728e30874SSatish Balay     t[1+idx] = s2;
81828e30874SSatish Balay     t[2+idx] = s3;
81928e30874SSatish Balay     t[3+idx] = s4;
82028e30874SSatish Balay   }
82128e30874SSatish Balay   /* backward solve the upper triangular */
82228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
82328e30874SSatish Balay     v   = aa + 16*diag[i] + 16;
82428e30874SSatish Balay     vi  = aj + diag[i] + 1;
82528e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
82628e30874SSatish Balay     idt = 4*i;
82728e30874SSatish Balay     s1  = t[idt];
82828e30874SSatish Balay     s2  = t[1+idt];
82928e30874SSatish Balay     s3  = t[2+idt];
83028e30874SSatish Balay     s4  = t[3+idt];
83128e30874SSatish Balay     while (nz--) {
83228e30874SSatish Balay       idx = 4*(*vi++);
83328e30874SSatish Balay       x1  = t[idx];
83428e30874SSatish Balay       x2  = t[1+idx];
83528e30874SSatish Balay       x3  = t[2+idx];
83628e30874SSatish Balay       x4  = t[3+idx];
83728e30874SSatish Balay       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
83828e30874SSatish Balay       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
83928e30874SSatish Balay       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
84028e30874SSatish Balay       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
84128e30874SSatish Balay       v  += 16;
84228e30874SSatish Balay     }
84328e30874SSatish Balay     idc      = 4*(*c--);
84428e30874SSatish Balay     v        = aa + 16*diag[i];
84528e30874SSatish Balay     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
84628e30874SSatish Balay     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
84728e30874SSatish Balay     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
84828e30874SSatish Balay     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
84928e30874SSatish Balay     x[idc]   = (PetscScalar)t[idt];
85028e30874SSatish Balay     x[1+idc] = (PetscScalar)t[1+idt];
85128e30874SSatish Balay     x[2+idc] = (PetscScalar)t[2+idt];
85228e30874SSatish Balay     x[3+idc] = (PetscScalar)t[3+idt];
85328e30874SSatish Balay   }
85428e30874SSatish Balay 
855*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
856*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
857*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
858*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
859*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
86028e30874SSatish Balay   PetscFunctionReturn(0);
86128e30874SSatish Balay }
86228e30874SSatish Balay 
86328e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
86428e30874SSatish Balay 
86528e30874SSatish Balay #include PETSC_HAVE_SSE
86628e30874SSatish Balay 
86728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
86828e30874SSatish Balay {
86928e30874SSatish Balay   /*
87028e30874SSatish Balay      Note: This code uses demotion of double
87128e30874SSatish Balay      to float when performing the mixed-mode computation.
87228e30874SSatish Balay      This may not be numerically reasonable for all applications.
87328e30874SSatish Balay   */
87428e30874SSatish Balay   Mat_SeqBAIJ    *a   = (Mat_SeqBAIJ*)A->data;
87528e30874SSatish Balay   IS             iscol=a->col,isrow=a->row;
87628e30874SSatish Balay   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
87728e30874SSatish Balay   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
87828e30874SSatish Balay   MatScalar      *aa=a->a,*v;
87928e30874SSatish Balay   PetscScalar    *x,*b,*t;
88028e30874SSatish Balay 
88128e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
88228e30874SSatish Balay   float         ssealignedspace[11],*tmps,*tmpx;
88328e30874SSatish Balay   unsigned long offset;
88428e30874SSatish Balay 
88528e30874SSatish Balay   PetscFunctionBegin;
88628e30874SSatish Balay   SSE_SCOPE_BEGIN;
88728e30874SSatish Balay 
88828e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
88928e30874SSatish Balay   if (offset) offset = (16 - offset)/4;
89028e30874SSatish Balay   tmps = &ssealignedspace[offset];
89128e30874SSatish Balay   tmpx = &ssealignedspace[offset+4];
89228e30874SSatish Balay   PREFETCH_NTA(aa+16*ai[1]);
89328e30874SSatish Balay 
894*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb,&b));
895*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
89628e30874SSatish Balay   t    = a->solve_work;
89728e30874SSatish Balay 
898*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
899*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
90028e30874SSatish Balay 
90128e30874SSatish Balay   /* forward solve the lower triangular */
90228e30874SSatish Balay   idx  = 4*(*r++);
90328e30874SSatish Balay   t[0] = b[idx];   t[1] = b[1+idx];
90428e30874SSatish Balay   t[2] = b[2+idx]; t[3] = b[3+idx];
90528e30874SSatish Balay   v    =  aa + 16*ai[1];
90628e30874SSatish Balay 
90728e30874SSatish Balay   for (i=1; i<n;) {
90828e30874SSatish Balay     PREFETCH_NTA(&v[8]);
90928e30874SSatish Balay     vi  =  aj      + ai[i];
91028e30874SSatish Balay     nz  =  diag[i] - ai[i];
91128e30874SSatish Balay     idx =  4*(*r++);
91228e30874SSatish Balay 
91328e30874SSatish Balay     /* Demote sum from double to float */
91428e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
91528e30874SSatish Balay     LOAD_PS(tmps,XMM7);
91628e30874SSatish Balay 
91728e30874SSatish Balay     while (nz--) {
91828e30874SSatish Balay       PREFETCH_NTA(&v[16]);
91928e30874SSatish Balay       idx = 4*(*vi++);
92028e30874SSatish Balay 
92128e30874SSatish Balay       /* Demote solution (so far) from double to float */
92228e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
92328e30874SSatish Balay 
92428e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
92528e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
92628e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
92728e30874SSatish Balay 
92828e30874SSatish Balay       /* First Column */
92928e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
93028e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
93128e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
93228e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
93328e30874SSatish Balay 
93428e30874SSatish Balay       /* Second Column */
93528e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
93628e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
93728e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
93828e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
93928e30874SSatish Balay 
94028e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
94128e30874SSatish Balay 
94228e30874SSatish Balay       /* Third Column */
94328e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
94428e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
94528e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
94628e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
94728e30874SSatish Balay 
94828e30874SSatish Balay       /* Fourth Column */
94928e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
95028e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
95128e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
95228e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
95328e30874SSatish Balay       SSE_INLINE_END_2
95428e30874SSatish Balay 
95528e30874SSatish Balay       v += 16;
95628e30874SSatish Balay     }
95728e30874SSatish Balay     idx = 4*i;
95828e30874SSatish Balay     v   = aa + 16*ai[++i];
95928e30874SSatish Balay     PREFETCH_NTA(v);
96028e30874SSatish Balay     STORE_PS(tmps,XMM7);
96128e30874SSatish Balay 
96228e30874SSatish Balay     /* Promote result from float to double */
96328e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
96428e30874SSatish Balay   }
96528e30874SSatish Balay   /* backward solve the upper triangular */
96628e30874SSatish Balay   idt  = 4*(n-1);
96728e30874SSatish Balay   ai16 = 16*diag[n-1];
96828e30874SSatish Balay   v    = aa + ai16 + 16;
96928e30874SSatish Balay   for (i=n-1; i>=0;) {
97028e30874SSatish Balay     PREFETCH_NTA(&v[8]);
97128e30874SSatish Balay     vi = aj + diag[i] + 1;
97228e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
97328e30874SSatish Balay 
97428e30874SSatish Balay     /* Demote accumulator from double to float */
97528e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
97628e30874SSatish Balay     LOAD_PS(tmps,XMM7);
97728e30874SSatish Balay 
97828e30874SSatish Balay     while (nz--) {
97928e30874SSatish Balay       PREFETCH_NTA(&v[16]);
98028e30874SSatish Balay       idx = 4*(*vi++);
98128e30874SSatish Balay 
98228e30874SSatish Balay       /* Demote solution (so far) from double to float */
98328e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
98428e30874SSatish Balay 
98528e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
98628e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx,v)
98728e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
98828e30874SSatish Balay 
98928e30874SSatish Balay       /* First Column */
99028e30874SSatish Balay       SSE_COPY_PS(XMM0,XMM6)
99128e30874SSatish Balay       SSE_SHUFFLE(XMM0,XMM0,0x00)
99228e30874SSatish Balay       SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
99328e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM0)
99428e30874SSatish Balay 
99528e30874SSatish Balay       /* Second Column */
99628e30874SSatish Balay       SSE_COPY_PS(XMM1,XMM6)
99728e30874SSatish Balay       SSE_SHUFFLE(XMM1,XMM1,0x55)
99828e30874SSatish Balay       SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
99928e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM1)
100028e30874SSatish Balay 
100128e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
100228e30874SSatish Balay 
100328e30874SSatish Balay       /* Third Column */
100428e30874SSatish Balay       SSE_COPY_PS(XMM2,XMM6)
100528e30874SSatish Balay       SSE_SHUFFLE(XMM2,XMM2,0xAA)
100628e30874SSatish Balay       SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
100728e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM2)
100828e30874SSatish Balay 
100928e30874SSatish Balay       /* Fourth Column */
101028e30874SSatish Balay       SSE_COPY_PS(XMM3,XMM6)
101128e30874SSatish Balay       SSE_SHUFFLE(XMM3,XMM3,0xFF)
101228e30874SSatish Balay       SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
101328e30874SSatish Balay       SSE_SUB_PS(XMM7,XMM3)
101428e30874SSatish Balay       SSE_INLINE_END_2
101528e30874SSatish Balay       v += 16;
101628e30874SSatish Balay     }
101728e30874SSatish Balay     v    = aa + ai16;
101828e30874SSatish Balay     ai16 = 16*diag[--i];
101928e30874SSatish Balay     PREFETCH_NTA(aa+ai16+16);
102028e30874SSatish Balay     /*
102128e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
102228e30874SSatish Balay        which was inverted as part of the factorization
102328e30874SSatish Balay     */
102428e30874SSatish Balay     SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
102528e30874SSatish Balay     /* First Column */
102628e30874SSatish Balay     SSE_COPY_PS(XMM0,XMM7)
102728e30874SSatish Balay     SSE_SHUFFLE(XMM0,XMM0,0x00)
102828e30874SSatish Balay     SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
102928e30874SSatish Balay 
103028e30874SSatish Balay     /* Second Column */
103128e30874SSatish Balay     SSE_COPY_PS(XMM1,XMM7)
103228e30874SSatish Balay     SSE_SHUFFLE(XMM1,XMM1,0x55)
103328e30874SSatish Balay     SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
103428e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM1)
103528e30874SSatish Balay 
103628e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
103728e30874SSatish Balay 
103828e30874SSatish Balay     /* Third Column */
103928e30874SSatish Balay     SSE_COPY_PS(XMM2,XMM7)
104028e30874SSatish Balay     SSE_SHUFFLE(XMM2,XMM2,0xAA)
104128e30874SSatish Balay     SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
104228e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM2)
104328e30874SSatish Balay 
104428e30874SSatish Balay     /* Fourth Column */
104528e30874SSatish Balay     SSE_COPY_PS(XMM3,XMM7)
104628e30874SSatish Balay     SSE_SHUFFLE(XMM3,XMM3,0xFF)
104728e30874SSatish Balay     SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
104828e30874SSatish Balay     SSE_ADD_PS(XMM0,XMM3)
104928e30874SSatish Balay 
105028e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
105128e30874SSatish Balay     SSE_INLINE_END_3
105228e30874SSatish Balay 
105328e30874SSatish Balay     /* Promote solution from float to double */
105428e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
105528e30874SSatish Balay 
105628e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
105728e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
105828e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
105928e30874SSatish Balay     idc = 4*(*c--);
106028e30874SSatish Balay     SSE_INLINE_BEGIN_2((float*)&t[idt],(float*)&x[idc])
106128e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
106228e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
106328e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
106428e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
106528e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
106628e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
106728e30874SSatish Balay     SSE_INLINE_END_2
106828e30874SSatish Balay     v    = aa + ai16 + 16;
106928e30874SSatish Balay     idt -= 4;
107028e30874SSatish Balay   }
107128e30874SSatish Balay 
1072*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1073*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1074*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb,&b));
1075*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1076*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n));
107728e30874SSatish Balay   SSE_SCOPE_END;
107828e30874SSatish Balay   PetscFunctionReturn(0);
107928e30874SSatish Balay }
108028e30874SSatish Balay 
108128e30874SSatish Balay #endif
108228e30874SSatish Balay 
108328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
108428e30874SSatish Balay {
108528e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
108628e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
108728e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
108828e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
108928e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
109028e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
109128e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
109228e30874SSatish Balay   const PetscScalar *b;
109328e30874SSatish Balay 
109428e30874SSatish Balay   PetscFunctionBegin;
1095*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1096*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
109728e30874SSatish Balay   t    = a->solve_work;
109828e30874SSatish Balay 
1099*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1100*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
110128e30874SSatish Balay 
110228e30874SSatish Balay   /* forward solve the lower triangular */
110328e30874SSatish Balay   idx  = 3*(*r++);
110428e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
110528e30874SSatish Balay   for (i=1; i<n; i++) {
110628e30874SSatish Balay     v   = aa + 9*ai[i];
110728e30874SSatish Balay     vi  = aj + ai[i];
110828e30874SSatish Balay     nz  = diag[i] - ai[i];
110928e30874SSatish Balay     idx = 3*(*r++);
111028e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
111128e30874SSatish Balay     while (nz--) {
111228e30874SSatish Balay       idx = 3*(*vi++);
111328e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
111428e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
111528e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
111628e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
111728e30874SSatish Balay       v  += 9;
111828e30874SSatish Balay     }
111928e30874SSatish Balay     idx    = 3*i;
112028e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
112128e30874SSatish Balay   }
112228e30874SSatish Balay   /* backward solve the upper triangular */
112328e30874SSatish Balay   for (i=n-1; i>=0; i--) {
112428e30874SSatish Balay     v   = aa + 9*diag[i] + 9;
112528e30874SSatish Balay     vi  = aj + diag[i] + 1;
112628e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
112728e30874SSatish Balay     idt = 3*i;
112828e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
112928e30874SSatish Balay     while (nz--) {
113028e30874SSatish Balay       idx = 3*(*vi++);
113128e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
113228e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
113328e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
113428e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
113528e30874SSatish Balay       v  += 9;
113628e30874SSatish Balay     }
113728e30874SSatish Balay     idc      = 3*(*c--);
113828e30874SSatish Balay     v        = aa + 9*diag[i];
113928e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
114028e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
114128e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
114228e30874SSatish Balay   }
1143*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1144*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1145*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1146*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1147*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n));
114828e30874SSatish Balay   PetscFunctionReturn(0);
114928e30874SSatish Balay }
115028e30874SSatish Balay 
115128e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
115228e30874SSatish Balay {
115328e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
115428e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
115528e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
115628e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc,m;
115728e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
115828e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
115928e30874SSatish Balay   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
116028e30874SSatish Balay   const PetscScalar *b;
116128e30874SSatish Balay 
116228e30874SSatish Balay   PetscFunctionBegin;
1163*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1164*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
116528e30874SSatish Balay   t    = a->solve_work;
116628e30874SSatish Balay 
1167*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1168*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
116928e30874SSatish Balay 
117028e30874SSatish Balay   /* forward solve the lower triangular */
117128e30874SSatish Balay   idx  = 3*r[0];
117228e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
117328e30874SSatish Balay   for (i=1; i<n; i++) {
117428e30874SSatish Balay     v   = aa + 9*ai[i];
117528e30874SSatish Balay     vi  = aj + ai[i];
117628e30874SSatish Balay     nz  = ai[i+1] - ai[i];
117728e30874SSatish Balay     idx = 3*r[i];
117828e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
117928e30874SSatish Balay     for (m=0; m<nz; m++) {
118028e30874SSatish Balay       idx = 3*vi[m];
118128e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
118228e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
118328e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
118428e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
118528e30874SSatish Balay       v  += 9;
118628e30874SSatish Balay     }
118728e30874SSatish Balay     idx    = 3*i;
118828e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
118928e30874SSatish Balay   }
119028e30874SSatish Balay   /* backward solve the upper triangular */
119128e30874SSatish Balay   for (i=n-1; i>=0; i--) {
119228e30874SSatish Balay     v   = aa + 9*(adiag[i+1]+1);
119328e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
119428e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
119528e30874SSatish Balay     idt = 3*i;
119628e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
119728e30874SSatish Balay     for (m=0; m<nz; m++) {
119828e30874SSatish Balay       idx = 3*vi[m];
119928e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
120028e30874SSatish Balay       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
120128e30874SSatish Balay       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
120228e30874SSatish Balay       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
120328e30874SSatish Balay       v  += 9;
120428e30874SSatish Balay     }
120528e30874SSatish Balay     idc      = 3*c[i];
120628e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
120728e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
120828e30874SSatish Balay     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
120928e30874SSatish Balay   }
1210*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1211*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1212*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1213*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1214*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n));
121528e30874SSatish Balay   PetscFunctionReturn(0);
121628e30874SSatish Balay }
121728e30874SSatish Balay 
121828e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
121928e30874SSatish Balay {
122028e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
122128e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
122228e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
122328e30874SSatish Balay   PetscInt          i,nz,idx,idt,idc;
122428e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
122528e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
122628e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
122728e30874SSatish Balay   const PetscScalar *b;
122828e30874SSatish Balay 
122928e30874SSatish Balay   PetscFunctionBegin;
1230*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1231*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
123228e30874SSatish Balay   t    = a->solve_work;
123328e30874SSatish Balay 
1234*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1235*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
123628e30874SSatish Balay 
123728e30874SSatish Balay   /* forward solve the lower triangular */
123828e30874SSatish Balay   idx  = 2*(*r++);
123928e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
124028e30874SSatish Balay   for (i=1; i<n; i++) {
124128e30874SSatish Balay     v   = aa + 4*ai[i];
124228e30874SSatish Balay     vi  = aj + ai[i];
124328e30874SSatish Balay     nz  = diag[i] - ai[i];
124428e30874SSatish Balay     idx = 2*(*r++);
124528e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
124628e30874SSatish Balay     while (nz--) {
124728e30874SSatish Balay       idx = 2*(*vi++);
124828e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
124928e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
125028e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
125128e30874SSatish Balay       v  += 4;
125228e30874SSatish Balay     }
125328e30874SSatish Balay     idx    = 2*i;
125428e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
125528e30874SSatish Balay   }
125628e30874SSatish Balay   /* backward solve the upper triangular */
125728e30874SSatish Balay   for (i=n-1; i>=0; i--) {
125828e30874SSatish Balay     v   = aa + 4*diag[i] + 4;
125928e30874SSatish Balay     vi  = aj + diag[i] + 1;
126028e30874SSatish Balay     nz  = ai[i+1] - diag[i] - 1;
126128e30874SSatish Balay     idt = 2*i;
126228e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
126328e30874SSatish Balay     while (nz--) {
126428e30874SSatish Balay       idx = 2*(*vi++);
126528e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
126628e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
126728e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
126828e30874SSatish Balay       v  += 4;
126928e30874SSatish Balay     }
127028e30874SSatish Balay     idc      = 2*(*c--);
127128e30874SSatish Balay     v        = aa + 4*diag[i];
127228e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
127328e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
127428e30874SSatish Balay   }
1275*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1276*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1277*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1278*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1279*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n));
128028e30874SSatish Balay   PetscFunctionReturn(0);
128128e30874SSatish Balay }
128228e30874SSatish Balay 
128328e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
128428e30874SSatish Balay {
128528e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
128628e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
128728e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
128828e30874SSatish Balay   PetscInt          i,nz,idx,jdx,idt,idc,m;
128928e30874SSatish Balay   const PetscInt    *r,*c,*rout,*cout;
129028e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
129128e30874SSatish Balay   PetscScalar       *x,s1,s2,x1,x2,*t;
129228e30874SSatish Balay   const PetscScalar *b;
129328e30874SSatish Balay 
129428e30874SSatish Balay   PetscFunctionBegin;
1295*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1296*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
129728e30874SSatish Balay   t    = a->solve_work;
129828e30874SSatish Balay 
1299*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1300*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
130128e30874SSatish Balay 
130228e30874SSatish Balay   /* forward solve the lower triangular */
130328e30874SSatish Balay   idx  = 2*r[0];
130428e30874SSatish Balay   t[0] = b[idx]; t[1] = b[1+idx];
130528e30874SSatish Balay   for (i=1; i<n; i++) {
130628e30874SSatish Balay     v   = aa + 4*ai[i];
130728e30874SSatish Balay     vi  = aj + ai[i];
130828e30874SSatish Balay     nz  = ai[i+1] - ai[i];
130928e30874SSatish Balay     idx = 2*r[i];
131028e30874SSatish Balay     s1  = b[idx]; s2 = b[1+idx];
131128e30874SSatish Balay     for (m=0; m<nz; m++) {
131228e30874SSatish Balay       jdx = 2*vi[m];
131328e30874SSatish Balay       x1  = t[jdx]; x2 = t[1+jdx];
131428e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
131528e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
131628e30874SSatish Balay       v  += 4;
131728e30874SSatish Balay     }
131828e30874SSatish Balay     idx    = 2*i;
131928e30874SSatish Balay     t[idx] = s1; t[1+idx] = s2;
132028e30874SSatish Balay   }
132128e30874SSatish Balay   /* backward solve the upper triangular */
132228e30874SSatish Balay   for (i=n-1; i>=0; i--) {
132328e30874SSatish Balay     v   = aa + 4*(adiag[i+1]+1);
132428e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
132528e30874SSatish Balay     nz  = adiag[i] - adiag[i+1] - 1;
132628e30874SSatish Balay     idt = 2*i;
132728e30874SSatish Balay     s1  = t[idt]; s2 = t[1+idt];
132828e30874SSatish Balay     for (m=0; m<nz; m++) {
132928e30874SSatish Balay       idx = 2*vi[m];
133028e30874SSatish Balay       x1  = t[idx]; x2 = t[1+idx];
133128e30874SSatish Balay       s1 -= v[0]*x1 + v[2]*x2;
133228e30874SSatish Balay       s2 -= v[1]*x1 + v[3]*x2;
133328e30874SSatish Balay       v  += 4;
133428e30874SSatish Balay     }
133528e30874SSatish Balay     idc      = 2*c[i];
133628e30874SSatish Balay     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
133728e30874SSatish Balay     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
133828e30874SSatish Balay   }
1339*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1340*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1341*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1342*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1343*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n));
134428e30874SSatish Balay   PetscFunctionReturn(0);
134528e30874SSatish Balay }
134628e30874SSatish Balay 
134728e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
134828e30874SSatish Balay {
134928e30874SSatish Balay   Mat_SeqBAIJ       *a   =(Mat_SeqBAIJ*)A->data;
135028e30874SSatish Balay   IS                iscol=a->col,isrow=a->row;
135128e30874SSatish Balay   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
135228e30874SSatish Balay   PetscInt          i,nz;
135328e30874SSatish Balay   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
135428e30874SSatish Balay   const MatScalar   *aa=a->a,*v;
135528e30874SSatish Balay   PetscScalar       *x,s1,*t;
135628e30874SSatish Balay   const PetscScalar *b;
135728e30874SSatish Balay 
135828e30874SSatish Balay   PetscFunctionBegin;
135928e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
136028e30874SSatish Balay 
1361*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1362*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
136328e30874SSatish Balay   t    = a->solve_work;
136428e30874SSatish Balay 
1365*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1366*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout + (n-1);
136728e30874SSatish Balay 
136828e30874SSatish Balay   /* forward solve the lower triangular */
136928e30874SSatish Balay   t[0] = b[*r++];
137028e30874SSatish Balay   for (i=1; i<n; i++) {
137128e30874SSatish Balay     v  = aa + ai[i];
137228e30874SSatish Balay     vi = aj + ai[i];
137328e30874SSatish Balay     nz = diag[i] - ai[i];
137428e30874SSatish Balay     s1 = b[*r++];
137528e30874SSatish Balay     while (nz--) {
137628e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
137728e30874SSatish Balay     }
137828e30874SSatish Balay     t[i] = s1;
137928e30874SSatish Balay   }
138028e30874SSatish Balay   /* backward solve the upper triangular */
138128e30874SSatish Balay   for (i=n-1; i>=0; i--) {
138228e30874SSatish Balay     v  = aa + diag[i] + 1;
138328e30874SSatish Balay     vi = aj + diag[i] + 1;
138428e30874SSatish Balay     nz = ai[i+1] - diag[i] - 1;
138528e30874SSatish Balay     s1 = t[i];
138628e30874SSatish Balay     while (nz--) {
138728e30874SSatish Balay       s1 -= (*v++)*t[*vi++];
138828e30874SSatish Balay     }
138928e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]]*s1;
139028e30874SSatish Balay   }
139128e30874SSatish Balay 
1392*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1393*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1394*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1395*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1396*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*1*(a->nz) - A->cmap->n));
139728e30874SSatish Balay   PetscFunctionReturn(0);
139828e30874SSatish Balay }
139928e30874SSatish Balay 
140028e30874SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
140128e30874SSatish Balay {
140228e30874SSatish Balay   Mat_SeqBAIJ       *a    = (Mat_SeqBAIJ*)A->data;
140328e30874SSatish Balay   IS                iscol = a->col,isrow = a->row;
140428e30874SSatish Balay   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
140528e30874SSatish Balay   const PetscInt    *rout,*cout,*r,*c;
140628e30874SSatish Balay   PetscScalar       *x,*tmp,sum;
140728e30874SSatish Balay   const PetscScalar *b;
140828e30874SSatish Balay   const MatScalar   *aa = a->a,*v;
140928e30874SSatish Balay 
141028e30874SSatish Balay   PetscFunctionBegin;
141128e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
141228e30874SSatish Balay 
1413*9566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb,&b));
1414*9566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx,&x));
141528e30874SSatish Balay   tmp  = a->solve_work;
141628e30874SSatish Balay 
1417*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(isrow,&rout)); r = rout;
1418*9566063dSJacob Faibussowitsch   PetscCall(ISGetIndices(iscol,&cout)); c = cout;
141928e30874SSatish Balay 
142028e30874SSatish Balay   /* forward solve the lower triangular */
142128e30874SSatish Balay   tmp[0] = b[r[0]];
142228e30874SSatish Balay   v      = aa;
142328e30874SSatish Balay   vi     = aj;
142428e30874SSatish Balay   for (i=1; i<n; i++) {
142528e30874SSatish Balay     nz  = ai[i+1] - ai[i];
142628e30874SSatish Balay     sum = b[r[i]];
142728e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
142828e30874SSatish Balay     tmp[i] = sum;
142928e30874SSatish Balay     v     += nz; vi += nz;
143028e30874SSatish Balay   }
143128e30874SSatish Balay 
143228e30874SSatish Balay   /* backward solve the upper triangular */
143328e30874SSatish Balay   for (i=n-1; i>=0; i--) {
143428e30874SSatish Balay     v   = aa + adiag[i+1]+1;
143528e30874SSatish Balay     vi  = aj + adiag[i+1]+1;
143628e30874SSatish Balay     nz  = adiag[i]-adiag[i+1]-1;
143728e30874SSatish Balay     sum = tmp[i];
143828e30874SSatish Balay     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
143928e30874SSatish Balay     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
144028e30874SSatish Balay   }
144128e30874SSatish Balay 
1442*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow,&rout));
1443*9566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol,&cout));
1444*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb,&b));
1445*9566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx,&x));
1446*9566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0*a->nz - A->cmap->n));
144728e30874SSatish Balay   PetscFunctionReturn(0);
144828e30874SSatish Balay }
1449