xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 9371c9d470a9602b6d10a8bf50c9b2280a79e45a)
128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
328e30874SSatish Balay 
4*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) {
528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
728e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
828e30874SSatish Balay   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
928e30874SSatish Balay   PetscInt           i, nz;
1028e30874SSatish Balay   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
1128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
1228e30874SSatish Balay   PetscScalar       *x, *s, *t, *ls;
1328e30874SSatish Balay   const PetscScalar *b;
1428e30874SSatish Balay 
1528e30874SSatish Balay   PetscFunctionBegin;
169566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
179566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
1828e30874SSatish Balay   t = a->solve_work;
1928e30874SSatish Balay 
20*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
21*9371c9d4SSatish Balay   r = rout;
22*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
23*9371c9d4SSatish Balay   c = cout + (n - 1);
2428e30874SSatish Balay 
2528e30874SSatish Balay   /* forward solve the lower triangular */
269566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
2728e30874SSatish Balay   for (i = 1; i < n; i++) {
2828e30874SSatish Balay     v  = aa + bs2 * ai[i];
2928e30874SSatish Balay     vi = aj + ai[i];
3028e30874SSatish Balay     nz = a->diag[i] - ai[i];
3128e30874SSatish Balay     s  = t + bs * i;
329566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
3328e30874SSatish Balay     while (nz--) {
3428e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
3528e30874SSatish Balay       v += bs2;
3628e30874SSatish Balay     }
3728e30874SSatish Balay   }
3828e30874SSatish Balay   /* backward solve the upper triangular */
3928e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
4028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
4128e30874SSatish Balay     v  = aa + bs2 * (a->diag[i] + 1);
4228e30874SSatish Balay     vi = aj + a->diag[i] + 1;
4328e30874SSatish Balay     nz = ai[i + 1] - a->diag[i] - 1;
449566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
4528e30874SSatish Balay     while (nz--) {
4628e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
4728e30874SSatish Balay       v += bs2;
4828e30874SSatish Balay     }
4928e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
509566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
5128e30874SSatish Balay   }
5228e30874SSatish Balay 
539566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
579566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
5828e30874SSatish Balay   PetscFunctionReturn(0);
5928e30874SSatish Balay }
6028e30874SSatish Balay 
61*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) {
6228e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
6328e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
6428e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
6528e30874SSatish Balay   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
6628e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
6728e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
6828e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
6928e30874SSatish Balay   const PetscScalar *b;
7028e30874SSatish Balay 
7128e30874SSatish Balay   PetscFunctionBegin;
729566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
739566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
7428e30874SSatish Balay   t = a->solve_work;
7528e30874SSatish Balay 
76*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
77*9371c9d4SSatish Balay   r = rout;
78*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
79*9371c9d4SSatish Balay   c = cout + (n - 1);
8028e30874SSatish Balay 
8128e30874SSatish Balay   /* forward solve the lower triangular */
8228e30874SSatish Balay   idx  = 7 * (*r++);
83*9371c9d4SSatish Balay   t[0] = b[idx];
84*9371c9d4SSatish Balay   t[1] = b[1 + idx];
85*9371c9d4SSatish Balay   t[2] = b[2 + idx];
86*9371c9d4SSatish Balay   t[3] = b[3 + idx];
87*9371c9d4SSatish Balay   t[4] = b[4 + idx];
88*9371c9d4SSatish Balay   t[5] = b[5 + idx];
89*9371c9d4SSatish Balay   t[6] = b[6 + idx];
9028e30874SSatish Balay 
9128e30874SSatish Balay   for (i = 1; i < n; i++) {
9228e30874SSatish Balay     v   = aa + 49 * ai[i];
9328e30874SSatish Balay     vi  = aj + ai[i];
9428e30874SSatish Balay     nz  = diag[i] - ai[i];
9528e30874SSatish Balay     idx = 7 * (*r++);
96*9371c9d4SSatish Balay     s1  = b[idx];
97*9371c9d4SSatish Balay     s2  = b[1 + idx];
98*9371c9d4SSatish Balay     s3  = b[2 + idx];
99*9371c9d4SSatish Balay     s4  = b[3 + idx];
100*9371c9d4SSatish Balay     s5  = b[4 + idx];
101*9371c9d4SSatish Balay     s6  = b[5 + idx];
102*9371c9d4SSatish Balay     s7  = b[6 + idx];
10328e30874SSatish Balay     while (nz--) {
10428e30874SSatish Balay       idx = 7 * (*vi++);
105*9371c9d4SSatish Balay       x1  = t[idx];
106*9371c9d4SSatish Balay       x2  = t[1 + idx];
107*9371c9d4SSatish Balay       x3  = t[2 + idx];
108*9371c9d4SSatish Balay       x4  = t[3 + idx];
109*9371c9d4SSatish Balay       x5  = t[4 + idx];
110*9371c9d4SSatish Balay       x6  = t[5 + idx];
111*9371c9d4SSatish Balay       x7  = t[6 + idx];
11228e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
11328e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
11428e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
11528e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
11628e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
11728e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
11828e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
11928e30874SSatish Balay       v += 49;
12028e30874SSatish Balay     }
12128e30874SSatish Balay     idx        = 7 * i;
122*9371c9d4SSatish Balay     t[idx]     = s1;
123*9371c9d4SSatish Balay     t[1 + idx] = s2;
124*9371c9d4SSatish Balay     t[2 + idx] = s3;
125*9371c9d4SSatish Balay     t[3 + idx] = s4;
126*9371c9d4SSatish Balay     t[4 + idx] = s5;
127*9371c9d4SSatish Balay     t[5 + idx] = s6;
128*9371c9d4SSatish Balay     t[6 + idx] = s7;
12928e30874SSatish Balay   }
13028e30874SSatish Balay   /* backward solve the upper triangular */
13128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
13228e30874SSatish Balay     v   = aa + 49 * diag[i] + 49;
13328e30874SSatish Balay     vi  = aj + diag[i] + 1;
13428e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
13528e30874SSatish Balay     idt = 7 * i;
136*9371c9d4SSatish Balay     s1  = t[idt];
137*9371c9d4SSatish Balay     s2  = t[1 + idt];
138*9371c9d4SSatish Balay     s3  = t[2 + idt];
139*9371c9d4SSatish Balay     s4  = t[3 + idt];
140*9371c9d4SSatish Balay     s5  = t[4 + idt];
141*9371c9d4SSatish Balay     s6  = t[5 + idt];
142*9371c9d4SSatish Balay     s7  = t[6 + idt];
14328e30874SSatish Balay     while (nz--) {
14428e30874SSatish Balay       idx = 7 * (*vi++);
145*9371c9d4SSatish Balay       x1  = t[idx];
146*9371c9d4SSatish Balay       x2  = t[1 + idx];
147*9371c9d4SSatish Balay       x3  = t[2 + idx];
148*9371c9d4SSatish Balay       x4  = t[3 + idx];
149*9371c9d4SSatish Balay       x5  = t[4 + idx];
150*9371c9d4SSatish Balay       x6  = t[5 + idx];
151*9371c9d4SSatish Balay       x7  = t[6 + idx];
15228e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
15328e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
15428e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
15528e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
15628e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
15728e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
15828e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
15928e30874SSatish Balay       v += 49;
16028e30874SSatish Balay     }
16128e30874SSatish Balay     idc    = 7 * (*c--);
16228e30874SSatish Balay     v      = aa + 49 * diag[i];
163*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
164*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
165*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
166*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
167*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
168*9371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
169*9371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
17028e30874SSatish Balay   }
17128e30874SSatish Balay 
1729566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
1739566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
1749566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1759566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1769566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
17728e30874SSatish Balay   PetscFunctionReturn(0);
17828e30874SSatish Balay }
17928e30874SSatish Balay 
180*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) {
18128e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
18228e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
18328e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
18428e30874SSatish Balay   const PetscInt     n = a->mbs, *rout, *cout, *vi;
18528e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
18628e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
18728e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
18828e30874SSatish Balay   const PetscScalar *b;
18928e30874SSatish Balay 
19028e30874SSatish Balay   PetscFunctionBegin;
1919566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
1929566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
19328e30874SSatish Balay   t = a->solve_work;
19428e30874SSatish Balay 
195*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
196*9371c9d4SSatish Balay   r = rout;
197*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
198*9371c9d4SSatish Balay   c = cout;
19928e30874SSatish Balay 
20028e30874SSatish Balay   /* forward solve the lower triangular */
20128e30874SSatish Balay   idx  = 7 * r[0];
202*9371c9d4SSatish Balay   t[0] = b[idx];
203*9371c9d4SSatish Balay   t[1] = b[1 + idx];
204*9371c9d4SSatish Balay   t[2] = b[2 + idx];
205*9371c9d4SSatish Balay   t[3] = b[3 + idx];
206*9371c9d4SSatish Balay   t[4] = b[4 + idx];
207*9371c9d4SSatish Balay   t[5] = b[5 + idx];
208*9371c9d4SSatish Balay   t[6] = b[6 + idx];
20928e30874SSatish Balay 
21028e30874SSatish Balay   for (i = 1; i < n; i++) {
21128e30874SSatish Balay     v   = aa + 49 * ai[i];
21228e30874SSatish Balay     vi  = aj + ai[i];
21328e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
21428e30874SSatish Balay     idx = 7 * r[i];
215*9371c9d4SSatish Balay     s1  = b[idx];
216*9371c9d4SSatish Balay     s2  = b[1 + idx];
217*9371c9d4SSatish Balay     s3  = b[2 + idx];
218*9371c9d4SSatish Balay     s4  = b[3 + idx];
219*9371c9d4SSatish Balay     s5  = b[4 + idx];
220*9371c9d4SSatish Balay     s6  = b[5 + idx];
221*9371c9d4SSatish Balay     s7  = b[6 + idx];
22228e30874SSatish Balay     for (m = 0; m < nz; m++) {
22328e30874SSatish Balay       idx = 7 * vi[m];
224*9371c9d4SSatish Balay       x1  = t[idx];
225*9371c9d4SSatish Balay       x2  = t[1 + idx];
226*9371c9d4SSatish Balay       x3  = t[2 + idx];
227*9371c9d4SSatish Balay       x4  = t[3 + idx];
228*9371c9d4SSatish Balay       x5  = t[4 + idx];
229*9371c9d4SSatish Balay       x6  = t[5 + idx];
230*9371c9d4SSatish Balay       x7  = t[6 + idx];
23128e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
23228e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
23328e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
23428e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
23528e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
23628e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
23728e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
23828e30874SSatish Balay       v += 49;
23928e30874SSatish Balay     }
24028e30874SSatish Balay     idx        = 7 * i;
241*9371c9d4SSatish Balay     t[idx]     = s1;
242*9371c9d4SSatish Balay     t[1 + idx] = s2;
243*9371c9d4SSatish Balay     t[2 + idx] = s3;
244*9371c9d4SSatish Balay     t[3 + idx] = s4;
245*9371c9d4SSatish Balay     t[4 + idx] = s5;
246*9371c9d4SSatish Balay     t[5 + idx] = s6;
247*9371c9d4SSatish Balay     t[6 + idx] = s7;
24828e30874SSatish Balay   }
24928e30874SSatish Balay   /* backward solve the upper triangular */
25028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
25128e30874SSatish Balay     v   = aa + 49 * (adiag[i + 1] + 1);
25228e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
25328e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
25428e30874SSatish Balay     idt = 7 * i;
255*9371c9d4SSatish Balay     s1  = t[idt];
256*9371c9d4SSatish Balay     s2  = t[1 + idt];
257*9371c9d4SSatish Balay     s3  = t[2 + idt];
258*9371c9d4SSatish Balay     s4  = t[3 + idt];
259*9371c9d4SSatish Balay     s5  = t[4 + idt];
260*9371c9d4SSatish Balay     s6  = t[5 + idt];
261*9371c9d4SSatish Balay     s7  = t[6 + idt];
26228e30874SSatish Balay     for (m = 0; m < nz; m++) {
26328e30874SSatish Balay       idx = 7 * vi[m];
264*9371c9d4SSatish Balay       x1  = t[idx];
265*9371c9d4SSatish Balay       x2  = t[1 + idx];
266*9371c9d4SSatish Balay       x3  = t[2 + idx];
267*9371c9d4SSatish Balay       x4  = t[3 + idx];
268*9371c9d4SSatish Balay       x5  = t[4 + idx];
269*9371c9d4SSatish Balay       x6  = t[5 + idx];
270*9371c9d4SSatish Balay       x7  = t[6 + idx];
27128e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
27228e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
27328e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
27428e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
27528e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
27628e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
27728e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
27828e30874SSatish Balay       v += 49;
27928e30874SSatish Balay     }
28028e30874SSatish Balay     idc    = 7 * c[i];
281*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
282*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
283*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
284*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
285*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
286*9371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
287*9371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
28828e30874SSatish Balay   }
28928e30874SSatish Balay 
2909566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
2919566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
2929566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
2939566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
2949566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
29528e30874SSatish Balay   PetscFunctionReturn(0);
29628e30874SSatish Balay }
29728e30874SSatish Balay 
298*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) {
29928e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
30028e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
30128e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
30228e30874SSatish Balay   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
30328e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
30428e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
30528e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
30628e30874SSatish Balay   const PetscScalar *b;
30728e30874SSatish Balay 
30828e30874SSatish Balay   PetscFunctionBegin;
3099566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
3109566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
31128e30874SSatish Balay   t = a->solve_work;
31228e30874SSatish Balay 
313*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
314*9371c9d4SSatish Balay   r = rout;
315*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
316*9371c9d4SSatish Balay   c = cout + (n - 1);
31728e30874SSatish Balay 
31828e30874SSatish Balay   /* forward solve the lower triangular */
31928e30874SSatish Balay   idx  = 6 * (*r++);
320*9371c9d4SSatish Balay   t[0] = b[idx];
321*9371c9d4SSatish Balay   t[1] = b[1 + idx];
322*9371c9d4SSatish Balay   t[2] = b[2 + idx];
323*9371c9d4SSatish Balay   t[3] = b[3 + idx];
324*9371c9d4SSatish Balay   t[4] = b[4 + idx];
325*9371c9d4SSatish Balay   t[5] = b[5 + idx];
32628e30874SSatish Balay   for (i = 1; i < n; i++) {
32728e30874SSatish Balay     v   = aa + 36 * ai[i];
32828e30874SSatish Balay     vi  = aj + ai[i];
32928e30874SSatish Balay     nz  = diag[i] - ai[i];
33028e30874SSatish Balay     idx = 6 * (*r++);
331*9371c9d4SSatish Balay     s1  = b[idx];
332*9371c9d4SSatish Balay     s2  = b[1 + idx];
333*9371c9d4SSatish Balay     s3  = b[2 + idx];
334*9371c9d4SSatish Balay     s4  = b[3 + idx];
335*9371c9d4SSatish Balay     s5  = b[4 + idx];
336*9371c9d4SSatish Balay     s6  = b[5 + idx];
33728e30874SSatish Balay     while (nz--) {
33828e30874SSatish Balay       idx = 6 * (*vi++);
339*9371c9d4SSatish Balay       x1  = t[idx];
340*9371c9d4SSatish Balay       x2  = t[1 + idx];
341*9371c9d4SSatish Balay       x3  = t[2 + idx];
342*9371c9d4SSatish Balay       x4  = t[3 + idx];
343*9371c9d4SSatish Balay       x5  = t[4 + idx];
344*9371c9d4SSatish Balay       x6  = t[5 + idx];
34528e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
34628e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
34728e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
34828e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
34928e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
35028e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
35128e30874SSatish Balay       v += 36;
35228e30874SSatish Balay     }
35328e30874SSatish Balay     idx        = 6 * i;
354*9371c9d4SSatish Balay     t[idx]     = s1;
355*9371c9d4SSatish Balay     t[1 + idx] = s2;
356*9371c9d4SSatish Balay     t[2 + idx] = s3;
357*9371c9d4SSatish Balay     t[3 + idx] = s4;
358*9371c9d4SSatish Balay     t[4 + idx] = s5;
359*9371c9d4SSatish Balay     t[5 + idx] = s6;
36028e30874SSatish Balay   }
36128e30874SSatish Balay   /* backward solve the upper triangular */
36228e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
36328e30874SSatish Balay     v   = aa + 36 * diag[i] + 36;
36428e30874SSatish Balay     vi  = aj + diag[i] + 1;
36528e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
36628e30874SSatish Balay     idt = 6 * i;
367*9371c9d4SSatish Balay     s1  = t[idt];
368*9371c9d4SSatish Balay     s2  = t[1 + idt];
369*9371c9d4SSatish Balay     s3  = t[2 + idt];
370*9371c9d4SSatish Balay     s4  = t[3 + idt];
371*9371c9d4SSatish Balay     s5  = t[4 + idt];
372*9371c9d4SSatish Balay     s6  = t[5 + idt];
37328e30874SSatish Balay     while (nz--) {
37428e30874SSatish Balay       idx = 6 * (*vi++);
375*9371c9d4SSatish Balay       x1  = t[idx];
376*9371c9d4SSatish Balay       x2  = t[1 + idx];
377*9371c9d4SSatish Balay       x3  = t[2 + idx];
378*9371c9d4SSatish Balay       x4  = t[3 + idx];
379*9371c9d4SSatish Balay       x5  = t[4 + idx];
380*9371c9d4SSatish Balay       x6  = t[5 + idx];
38128e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
38228e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
38328e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
38428e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
38528e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
38628e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
38728e30874SSatish Balay       v += 36;
38828e30874SSatish Balay     }
38928e30874SSatish Balay     idc    = 6 * (*c--);
39028e30874SSatish Balay     v      = aa + 36 * diag[i];
391*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
392*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
393*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
394*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
395*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
396*9371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
39728e30874SSatish Balay   }
39828e30874SSatish Balay 
3999566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
4009566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
4019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
4029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
4039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
40428e30874SSatish Balay   PetscFunctionReturn(0);
40528e30874SSatish Balay }
40628e30874SSatish Balay 
407*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) {
40828e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
40928e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
41028e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
41128e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
41228e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
41328e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
41428e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
41528e30874SSatish Balay   const PetscScalar *b;
41628e30874SSatish Balay 
41728e30874SSatish Balay   PetscFunctionBegin;
4189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
4199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42028e30874SSatish Balay   t = a->solve_work;
42128e30874SSatish Balay 
422*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
423*9371c9d4SSatish Balay   r = rout;
424*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
425*9371c9d4SSatish Balay   c = cout;
42628e30874SSatish Balay 
42728e30874SSatish Balay   /* forward solve the lower triangular */
42828e30874SSatish Balay   idx  = 6 * r[0];
429*9371c9d4SSatish Balay   t[0] = b[idx];
430*9371c9d4SSatish Balay   t[1] = b[1 + idx];
431*9371c9d4SSatish Balay   t[2] = b[2 + idx];
432*9371c9d4SSatish Balay   t[3] = b[3 + idx];
433*9371c9d4SSatish Balay   t[4] = b[4 + idx];
434*9371c9d4SSatish Balay   t[5] = b[5 + idx];
43528e30874SSatish Balay   for (i = 1; i < n; i++) {
43628e30874SSatish Balay     v   = aa + 36 * ai[i];
43728e30874SSatish Balay     vi  = aj + ai[i];
43828e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
43928e30874SSatish Balay     idx = 6 * r[i];
440*9371c9d4SSatish Balay     s1  = b[idx];
441*9371c9d4SSatish Balay     s2  = b[1 + idx];
442*9371c9d4SSatish Balay     s3  = b[2 + idx];
443*9371c9d4SSatish Balay     s4  = b[3 + idx];
444*9371c9d4SSatish Balay     s5  = b[4 + idx];
445*9371c9d4SSatish Balay     s6  = b[5 + idx];
44628e30874SSatish Balay     for (m = 0; m < nz; m++) {
44728e30874SSatish Balay       idx = 6 * vi[m];
448*9371c9d4SSatish Balay       x1  = t[idx];
449*9371c9d4SSatish Balay       x2  = t[1 + idx];
450*9371c9d4SSatish Balay       x3  = t[2 + idx];
451*9371c9d4SSatish Balay       x4  = t[3 + idx];
452*9371c9d4SSatish Balay       x5  = t[4 + idx];
453*9371c9d4SSatish Balay       x6  = t[5 + idx];
45428e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
45528e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
45628e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
45728e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
45828e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
45928e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
46028e30874SSatish Balay       v += 36;
46128e30874SSatish Balay     }
46228e30874SSatish Balay     idx        = 6 * i;
463*9371c9d4SSatish Balay     t[idx]     = s1;
464*9371c9d4SSatish Balay     t[1 + idx] = s2;
465*9371c9d4SSatish Balay     t[2 + idx] = s3;
466*9371c9d4SSatish Balay     t[3 + idx] = s4;
467*9371c9d4SSatish Balay     t[4 + idx] = s5;
468*9371c9d4SSatish Balay     t[5 + idx] = s6;
46928e30874SSatish Balay   }
47028e30874SSatish Balay   /* backward solve the upper triangular */
47128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
47228e30874SSatish Balay     v   = aa + 36 * (adiag[i + 1] + 1);
47328e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
47428e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
47528e30874SSatish Balay     idt = 6 * i;
476*9371c9d4SSatish Balay     s1  = t[idt];
477*9371c9d4SSatish Balay     s2  = t[1 + idt];
478*9371c9d4SSatish Balay     s3  = t[2 + idt];
479*9371c9d4SSatish Balay     s4  = t[3 + idt];
480*9371c9d4SSatish Balay     s5  = t[4 + idt];
481*9371c9d4SSatish Balay     s6  = t[5 + idt];
48228e30874SSatish Balay     for (m = 0; m < nz; m++) {
48328e30874SSatish Balay       idx = 6 * vi[m];
484*9371c9d4SSatish Balay       x1  = t[idx];
485*9371c9d4SSatish Balay       x2  = t[1 + idx];
486*9371c9d4SSatish Balay       x3  = t[2 + idx];
487*9371c9d4SSatish Balay       x4  = t[3 + idx];
488*9371c9d4SSatish Balay       x5  = t[4 + idx];
489*9371c9d4SSatish Balay       x6  = t[5 + idx];
49028e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
49128e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
49228e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
49328e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
49428e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
49528e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
49628e30874SSatish Balay       v += 36;
49728e30874SSatish Balay     }
49828e30874SSatish Balay     idc    = 6 * c[i];
499*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
500*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
501*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
502*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
503*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
504*9371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
50528e30874SSatish Balay   }
50628e30874SSatish Balay 
5079566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
5089566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
5099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
5109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
5119566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
51228e30874SSatish Balay   PetscFunctionReturn(0);
51328e30874SSatish Balay }
51428e30874SSatish Balay 
515*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) {
51628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
51728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
51828e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
51928e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
52028e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
52128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
52228e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
52328e30874SSatish Balay   const PetscScalar *b;
52428e30874SSatish Balay 
52528e30874SSatish Balay   PetscFunctionBegin;
5269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
5279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
52828e30874SSatish Balay   t = a->solve_work;
52928e30874SSatish Balay 
530*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
531*9371c9d4SSatish Balay   r = rout;
532*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
533*9371c9d4SSatish Balay   c = cout + (n - 1);
53428e30874SSatish Balay 
53528e30874SSatish Balay   /* forward solve the lower triangular */
53628e30874SSatish Balay   idx  = 5 * (*r++);
537*9371c9d4SSatish Balay   t[0] = b[idx];
538*9371c9d4SSatish Balay   t[1] = b[1 + idx];
539*9371c9d4SSatish Balay   t[2] = b[2 + idx];
540*9371c9d4SSatish Balay   t[3] = b[3 + idx];
541*9371c9d4SSatish Balay   t[4] = b[4 + idx];
54228e30874SSatish Balay   for (i = 1; i < n; i++) {
54328e30874SSatish Balay     v   = aa + 25 * ai[i];
54428e30874SSatish Balay     vi  = aj + ai[i];
54528e30874SSatish Balay     nz  = diag[i] - ai[i];
54628e30874SSatish Balay     idx = 5 * (*r++);
547*9371c9d4SSatish Balay     s1  = b[idx];
548*9371c9d4SSatish Balay     s2  = b[1 + idx];
549*9371c9d4SSatish Balay     s3  = b[2 + idx];
550*9371c9d4SSatish Balay     s4  = b[3 + idx];
55128e30874SSatish Balay     s5  = b[4 + idx];
55228e30874SSatish Balay     while (nz--) {
55328e30874SSatish Balay       idx = 5 * (*vi++);
554*9371c9d4SSatish Balay       x1  = t[idx];
555*9371c9d4SSatish Balay       x2  = t[1 + idx];
556*9371c9d4SSatish Balay       x3  = t[2 + idx];
557*9371c9d4SSatish Balay       x4  = t[3 + idx];
558*9371c9d4SSatish Balay       x5  = t[4 + idx];
55928e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
56028e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
56128e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
56228e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
56328e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
56428e30874SSatish Balay       v += 25;
56528e30874SSatish Balay     }
56628e30874SSatish Balay     idx        = 5 * i;
567*9371c9d4SSatish Balay     t[idx]     = s1;
568*9371c9d4SSatish Balay     t[1 + idx] = s2;
569*9371c9d4SSatish Balay     t[2 + idx] = s3;
570*9371c9d4SSatish Balay     t[3 + idx] = s4;
571*9371c9d4SSatish Balay     t[4 + idx] = s5;
57228e30874SSatish Balay   }
57328e30874SSatish Balay   /* backward solve the upper triangular */
57428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
57528e30874SSatish Balay     v   = aa + 25 * diag[i] + 25;
57628e30874SSatish Balay     vi  = aj + diag[i] + 1;
57728e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
57828e30874SSatish Balay     idt = 5 * i;
579*9371c9d4SSatish Balay     s1  = t[idt];
580*9371c9d4SSatish Balay     s2  = t[1 + idt];
581*9371c9d4SSatish Balay     s3  = t[2 + idt];
582*9371c9d4SSatish Balay     s4  = t[3 + idt];
583*9371c9d4SSatish Balay     s5  = t[4 + idt];
58428e30874SSatish Balay     while (nz--) {
58528e30874SSatish Balay       idx = 5 * (*vi++);
586*9371c9d4SSatish Balay       x1  = t[idx];
587*9371c9d4SSatish Balay       x2  = t[1 + idx];
588*9371c9d4SSatish Balay       x3  = t[2 + idx];
589*9371c9d4SSatish Balay       x4  = t[3 + idx];
590*9371c9d4SSatish Balay       x5  = t[4 + idx];
59128e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
59228e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
59328e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
59428e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
59528e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
59628e30874SSatish Balay       v += 25;
59728e30874SSatish Balay     }
59828e30874SSatish Balay     idc    = 5 * (*c--);
59928e30874SSatish Balay     v      = aa + 25 * diag[i];
600*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
601*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
602*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
603*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
604*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
60528e30874SSatish Balay   }
60628e30874SSatish Balay 
6079566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
6089566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
6099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
6109566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
6119566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
61228e30874SSatish Balay   PetscFunctionReturn(0);
61328e30874SSatish Balay }
61428e30874SSatish Balay 
615*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) {
61628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
61728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
61828e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
61928e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
62028e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
62128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
62228e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
62328e30874SSatish Balay   const PetscScalar *b;
62428e30874SSatish Balay 
62528e30874SSatish Balay   PetscFunctionBegin;
6269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
6279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
62828e30874SSatish Balay   t = a->solve_work;
62928e30874SSatish Balay 
630*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
631*9371c9d4SSatish Balay   r = rout;
632*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
633*9371c9d4SSatish Balay   c = cout;
63428e30874SSatish Balay 
63528e30874SSatish Balay   /* forward solve the lower triangular */
63628e30874SSatish Balay   idx  = 5 * r[0];
637*9371c9d4SSatish Balay   t[0] = b[idx];
638*9371c9d4SSatish Balay   t[1] = b[1 + idx];
639*9371c9d4SSatish Balay   t[2] = b[2 + idx];
640*9371c9d4SSatish Balay   t[3] = b[3 + idx];
641*9371c9d4SSatish Balay   t[4] = b[4 + idx];
64228e30874SSatish Balay   for (i = 1; i < n; i++) {
64328e30874SSatish Balay     v   = aa + 25 * ai[i];
64428e30874SSatish Balay     vi  = aj + ai[i];
64528e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
64628e30874SSatish Balay     idx = 5 * r[i];
647*9371c9d4SSatish Balay     s1  = b[idx];
648*9371c9d4SSatish Balay     s2  = b[1 + idx];
649*9371c9d4SSatish Balay     s3  = b[2 + idx];
650*9371c9d4SSatish Balay     s4  = b[3 + idx];
65128e30874SSatish Balay     s5  = b[4 + idx];
65228e30874SSatish Balay     for (m = 0; m < nz; m++) {
65328e30874SSatish Balay       idx = 5 * vi[m];
654*9371c9d4SSatish Balay       x1  = t[idx];
655*9371c9d4SSatish Balay       x2  = t[1 + idx];
656*9371c9d4SSatish Balay       x3  = t[2 + idx];
657*9371c9d4SSatish Balay       x4  = t[3 + idx];
658*9371c9d4SSatish Balay       x5  = t[4 + idx];
65928e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
66028e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
66128e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
66228e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
66328e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
66428e30874SSatish Balay       v += 25;
66528e30874SSatish Balay     }
66628e30874SSatish Balay     idx        = 5 * i;
667*9371c9d4SSatish Balay     t[idx]     = s1;
668*9371c9d4SSatish Balay     t[1 + idx] = s2;
669*9371c9d4SSatish Balay     t[2 + idx] = s3;
670*9371c9d4SSatish Balay     t[3 + idx] = s4;
671*9371c9d4SSatish Balay     t[4 + idx] = s5;
67228e30874SSatish Balay   }
67328e30874SSatish Balay   /* backward solve the upper triangular */
67428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
67528e30874SSatish Balay     v   = aa + 25 * (adiag[i + 1] + 1);
67628e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
67728e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
67828e30874SSatish Balay     idt = 5 * i;
679*9371c9d4SSatish Balay     s1  = t[idt];
680*9371c9d4SSatish Balay     s2  = t[1 + idt];
681*9371c9d4SSatish Balay     s3  = t[2 + idt];
682*9371c9d4SSatish Balay     s4  = t[3 + idt];
683*9371c9d4SSatish Balay     s5  = t[4 + idt];
68428e30874SSatish Balay     for (m = 0; m < nz; m++) {
68528e30874SSatish Balay       idx = 5 * vi[m];
686*9371c9d4SSatish Balay       x1  = t[idx];
687*9371c9d4SSatish Balay       x2  = t[1 + idx];
688*9371c9d4SSatish Balay       x3  = t[2 + idx];
689*9371c9d4SSatish Balay       x4  = t[3 + idx];
690*9371c9d4SSatish Balay       x5  = t[4 + idx];
69128e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
69228e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
69328e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
69428e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
69528e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
69628e30874SSatish Balay       v += 25;
69728e30874SSatish Balay     }
69828e30874SSatish Balay     idc    = 5 * c[i];
699*9371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
700*9371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
701*9371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
702*9371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
703*9371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
70428e30874SSatish Balay   }
70528e30874SSatish Balay 
7069566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
7079566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
7089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
7099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
7109566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
71128e30874SSatish Balay   PetscFunctionReturn(0);
71228e30874SSatish Balay }
71328e30874SSatish Balay 
714*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) {
71528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
71628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
71728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
71828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
71928e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
72028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
72128e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
72228e30874SSatish Balay   const PetscScalar *b;
72328e30874SSatish Balay 
72428e30874SSatish Balay   PetscFunctionBegin;
7259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
72728e30874SSatish Balay   t = a->solve_work;
72828e30874SSatish Balay 
729*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
730*9371c9d4SSatish Balay   r = rout;
731*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
732*9371c9d4SSatish Balay   c = cout + (n - 1);
73328e30874SSatish Balay 
73428e30874SSatish Balay   /* forward solve the lower triangular */
73528e30874SSatish Balay   idx  = 4 * (*r++);
736*9371c9d4SSatish Balay   t[0] = b[idx];
737*9371c9d4SSatish Balay   t[1] = b[1 + idx];
738*9371c9d4SSatish Balay   t[2] = b[2 + idx];
739*9371c9d4SSatish Balay   t[3] = b[3 + idx];
74028e30874SSatish Balay   for (i = 1; i < n; i++) {
74128e30874SSatish Balay     v   = aa + 16 * ai[i];
74228e30874SSatish Balay     vi  = aj + ai[i];
74328e30874SSatish Balay     nz  = diag[i] - ai[i];
74428e30874SSatish Balay     idx = 4 * (*r++);
745*9371c9d4SSatish Balay     s1  = b[idx];
746*9371c9d4SSatish Balay     s2  = b[1 + idx];
747*9371c9d4SSatish Balay     s3  = b[2 + idx];
748*9371c9d4SSatish Balay     s4  = b[3 + idx];
74928e30874SSatish Balay     while (nz--) {
75028e30874SSatish Balay       idx = 4 * (*vi++);
751*9371c9d4SSatish Balay       x1  = t[idx];
752*9371c9d4SSatish Balay       x2  = t[1 + idx];
753*9371c9d4SSatish Balay       x3  = t[2 + idx];
754*9371c9d4SSatish Balay       x4  = t[3 + idx];
75528e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
75628e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
75728e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
75828e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
75928e30874SSatish Balay       v += 16;
76028e30874SSatish Balay     }
76128e30874SSatish Balay     idx        = 4 * i;
762*9371c9d4SSatish Balay     t[idx]     = s1;
763*9371c9d4SSatish Balay     t[1 + idx] = s2;
764*9371c9d4SSatish Balay     t[2 + idx] = s3;
765*9371c9d4SSatish Balay     t[3 + idx] = s4;
76628e30874SSatish Balay   }
76728e30874SSatish Balay   /* backward solve the upper triangular */
76828e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
76928e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
77028e30874SSatish Balay     vi  = aj + diag[i] + 1;
77128e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
77228e30874SSatish Balay     idt = 4 * i;
773*9371c9d4SSatish Balay     s1  = t[idt];
774*9371c9d4SSatish Balay     s2  = t[1 + idt];
775*9371c9d4SSatish Balay     s3  = t[2 + idt];
776*9371c9d4SSatish Balay     s4  = t[3 + idt];
77728e30874SSatish Balay     while (nz--) {
77828e30874SSatish Balay       idx = 4 * (*vi++);
779*9371c9d4SSatish Balay       x1  = t[idx];
780*9371c9d4SSatish Balay       x2  = t[1 + idx];
781*9371c9d4SSatish Balay       x3  = t[2 + idx];
782*9371c9d4SSatish Balay       x4  = t[3 + idx];
78328e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
78428e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
78528e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
78628e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
78728e30874SSatish Balay       v += 16;
78828e30874SSatish Balay     }
78928e30874SSatish Balay     idc    = 4 * (*c--);
79028e30874SSatish Balay     v      = aa + 16 * diag[i];
79128e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
79228e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
79328e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
79428e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
79528e30874SSatish Balay   }
79628e30874SSatish Balay 
7979566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
7989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
7999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
8009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
8019566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
80228e30874SSatish Balay   PetscFunctionReturn(0);
80328e30874SSatish Balay }
80428e30874SSatish Balay 
805*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) {
80628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
80728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
80828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
80928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
81028e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
81128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
81228e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
81328e30874SSatish Balay   const PetscScalar *b;
81428e30874SSatish Balay 
81528e30874SSatish Balay   PetscFunctionBegin;
8169566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
8179566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
81828e30874SSatish Balay   t = a->solve_work;
81928e30874SSatish Balay 
820*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
821*9371c9d4SSatish Balay   r = rout;
822*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
823*9371c9d4SSatish Balay   c = cout;
82428e30874SSatish Balay 
82528e30874SSatish Balay   /* forward solve the lower triangular */
82628e30874SSatish Balay   idx  = 4 * r[0];
827*9371c9d4SSatish Balay   t[0] = b[idx];
828*9371c9d4SSatish Balay   t[1] = b[1 + idx];
829*9371c9d4SSatish Balay   t[2] = b[2 + idx];
830*9371c9d4SSatish Balay   t[3] = b[3 + idx];
83128e30874SSatish Balay   for (i = 1; i < n; i++) {
83228e30874SSatish Balay     v   = aa + 16 * ai[i];
83328e30874SSatish Balay     vi  = aj + ai[i];
83428e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
83528e30874SSatish Balay     idx = 4 * r[i];
836*9371c9d4SSatish Balay     s1  = b[idx];
837*9371c9d4SSatish Balay     s2  = b[1 + idx];
838*9371c9d4SSatish Balay     s3  = b[2 + idx];
839*9371c9d4SSatish Balay     s4  = b[3 + idx];
84028e30874SSatish Balay     for (m = 0; m < nz; m++) {
84128e30874SSatish Balay       idx = 4 * vi[m];
842*9371c9d4SSatish Balay       x1  = t[idx];
843*9371c9d4SSatish Balay       x2  = t[1 + idx];
844*9371c9d4SSatish Balay       x3  = t[2 + idx];
845*9371c9d4SSatish Balay       x4  = t[3 + idx];
84628e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
84728e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
84828e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
84928e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
85028e30874SSatish Balay       v += 16;
85128e30874SSatish Balay     }
85228e30874SSatish Balay     idx        = 4 * i;
853*9371c9d4SSatish Balay     t[idx]     = s1;
854*9371c9d4SSatish Balay     t[1 + idx] = s2;
855*9371c9d4SSatish Balay     t[2 + idx] = s3;
856*9371c9d4SSatish Balay     t[3 + idx] = s4;
85728e30874SSatish Balay   }
85828e30874SSatish Balay   /* backward solve the upper triangular */
85928e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
86028e30874SSatish Balay     v   = aa + 16 * (adiag[i + 1] + 1);
86128e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
86228e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
86328e30874SSatish Balay     idt = 4 * i;
864*9371c9d4SSatish Balay     s1  = t[idt];
865*9371c9d4SSatish Balay     s2  = t[1 + idt];
866*9371c9d4SSatish Balay     s3  = t[2 + idt];
867*9371c9d4SSatish Balay     s4  = t[3 + idt];
86828e30874SSatish Balay     for (m = 0; m < nz; m++) {
86928e30874SSatish Balay       idx = 4 * vi[m];
870*9371c9d4SSatish Balay       x1  = t[idx];
871*9371c9d4SSatish Balay       x2  = t[1 + idx];
872*9371c9d4SSatish Balay       x3  = t[2 + idx];
873*9371c9d4SSatish Balay       x4  = t[3 + idx];
87428e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
87528e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
87628e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
87728e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
87828e30874SSatish Balay       v += 16;
87928e30874SSatish Balay     }
88028e30874SSatish Balay     idc    = 4 * c[i];
88128e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
88228e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
88328e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
88428e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
88528e30874SSatish Balay   }
88628e30874SSatish Balay 
8879566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
8889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
8899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
8909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
8919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
89228e30874SSatish Balay   PetscFunctionReturn(0);
89328e30874SSatish Balay }
89428e30874SSatish Balay 
895*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) {
89628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
89728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
89828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
89928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
90028e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
90128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
90228e30874SSatish Balay   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
90328e30874SSatish Balay   PetscScalar       *x;
90428e30874SSatish Balay   const PetscScalar *b;
90528e30874SSatish Balay 
90628e30874SSatish Balay   PetscFunctionBegin;
9079566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
9089566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
90928e30874SSatish Balay   t = (MatScalar *)a->solve_work;
91028e30874SSatish Balay 
911*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
912*9371c9d4SSatish Balay   r = rout;
913*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
914*9371c9d4SSatish Balay   c = cout + (n - 1);
91528e30874SSatish Balay 
91628e30874SSatish Balay   /* forward solve the lower triangular */
91728e30874SSatish Balay   idx  = 4 * (*r++);
91828e30874SSatish Balay   t[0] = (MatScalar)b[idx];
91928e30874SSatish Balay   t[1] = (MatScalar)b[1 + idx];
92028e30874SSatish Balay   t[2] = (MatScalar)b[2 + idx];
92128e30874SSatish Balay   t[3] = (MatScalar)b[3 + idx];
92228e30874SSatish Balay   for (i = 1; i < n; i++) {
92328e30874SSatish Balay     v   = aa + 16 * ai[i];
92428e30874SSatish Balay     vi  = aj + ai[i];
92528e30874SSatish Balay     nz  = diag[i] - ai[i];
92628e30874SSatish Balay     idx = 4 * (*r++);
92728e30874SSatish Balay     s1  = (MatScalar)b[idx];
92828e30874SSatish Balay     s2  = (MatScalar)b[1 + idx];
92928e30874SSatish Balay     s3  = (MatScalar)b[2 + idx];
93028e30874SSatish Balay     s4  = (MatScalar)b[3 + idx];
93128e30874SSatish Balay     while (nz--) {
93228e30874SSatish Balay       idx = 4 * (*vi++);
93328e30874SSatish Balay       x1  = t[idx];
93428e30874SSatish Balay       x2  = t[1 + idx];
93528e30874SSatish Balay       x3  = t[2 + idx];
93628e30874SSatish Balay       x4  = t[3 + idx];
93728e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
93828e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
93928e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
94028e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
94128e30874SSatish Balay       v += 16;
94228e30874SSatish Balay     }
94328e30874SSatish Balay     idx        = 4 * i;
94428e30874SSatish Balay     t[idx]     = s1;
94528e30874SSatish Balay     t[1 + idx] = s2;
94628e30874SSatish Balay     t[2 + idx] = s3;
94728e30874SSatish Balay     t[3 + idx] = s4;
94828e30874SSatish Balay   }
94928e30874SSatish Balay   /* backward solve the upper triangular */
95028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
95128e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
95228e30874SSatish Balay     vi  = aj + diag[i] + 1;
95328e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
95428e30874SSatish Balay     idt = 4 * i;
95528e30874SSatish Balay     s1  = t[idt];
95628e30874SSatish Balay     s2  = t[1 + idt];
95728e30874SSatish Balay     s3  = t[2 + idt];
95828e30874SSatish Balay     s4  = t[3 + idt];
95928e30874SSatish Balay     while (nz--) {
96028e30874SSatish Balay       idx = 4 * (*vi++);
96128e30874SSatish Balay       x1  = t[idx];
96228e30874SSatish Balay       x2  = t[1 + idx];
96328e30874SSatish Balay       x3  = t[2 + idx];
96428e30874SSatish Balay       x4  = t[3 + idx];
96528e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
96628e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
96728e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
96828e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
96928e30874SSatish Balay       v += 16;
97028e30874SSatish Balay     }
97128e30874SSatish Balay     idc        = 4 * (*c--);
97228e30874SSatish Balay     v          = aa + 16 * diag[i];
97328e30874SSatish Balay     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
97428e30874SSatish Balay     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
97528e30874SSatish Balay     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
97628e30874SSatish Balay     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
97728e30874SSatish Balay     x[idc]     = (PetscScalar)t[idt];
97828e30874SSatish Balay     x[1 + idc] = (PetscScalar)t[1 + idt];
97928e30874SSatish Balay     x[2 + idc] = (PetscScalar)t[2 + idt];
98028e30874SSatish Balay     x[3 + idc] = (PetscScalar)t[3 + idt];
98128e30874SSatish Balay   }
98228e30874SSatish Balay 
9839566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
9849566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
9859566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
9869566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
9879566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
98828e30874SSatish Balay   PetscFunctionReturn(0);
98928e30874SSatish Balay }
99028e30874SSatish Balay 
99128e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
99228e30874SSatish Balay 
99328e30874SSatish Balay #include PETSC_HAVE_SSE
99428e30874SSatish Balay 
995*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) {
99628e30874SSatish Balay   /*
99728e30874SSatish Balay      Note: This code uses demotion of double
99828e30874SSatish Balay      to float when performing the mixed-mode computation.
99928e30874SSatish Balay      This may not be numerically reasonable for all applications.
100028e30874SSatish Balay   */
100128e30874SSatish Balay   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
100228e30874SSatish Balay   IS              iscol = a->col, isrow = a->row;
100328e30874SSatish Balay   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
100428e30874SSatish Balay   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
100528e30874SSatish Balay   MatScalar      *aa = a->a, *v;
100628e30874SSatish Balay   PetscScalar    *x, *b, *t;
100728e30874SSatish Balay 
100828e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
100928e30874SSatish Balay   float         ssealignedspace[11], *tmps, *tmpx;
101028e30874SSatish Balay   unsigned long offset;
101128e30874SSatish Balay 
101228e30874SSatish Balay   PetscFunctionBegin;
101328e30874SSatish Balay   SSE_SCOPE_BEGIN;
101428e30874SSatish Balay 
101528e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
101628e30874SSatish Balay   if (offset) offset = (16 - offset) / 4;
101728e30874SSatish Balay   tmps = &ssealignedspace[offset];
101828e30874SSatish Balay   tmpx = &ssealignedspace[offset + 4];
101928e30874SSatish Balay   PREFETCH_NTA(aa + 16 * ai[1]);
102028e30874SSatish Balay 
10219566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
10229566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
102328e30874SSatish Balay   t = a->solve_work;
102428e30874SSatish Balay 
1025*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1026*9371c9d4SSatish Balay   r = rout;
1027*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1028*9371c9d4SSatish Balay   c = cout + (n - 1);
102928e30874SSatish Balay 
103028e30874SSatish Balay   /* forward solve the lower triangular */
103128e30874SSatish Balay   idx  = 4 * (*r++);
1032*9371c9d4SSatish Balay   t[0] = b[idx];
1033*9371c9d4SSatish Balay   t[1] = b[1 + idx];
1034*9371c9d4SSatish Balay   t[2] = b[2 + idx];
1035*9371c9d4SSatish Balay   t[3] = b[3 + idx];
103628e30874SSatish Balay   v    = aa + 16 * ai[1];
103728e30874SSatish Balay 
103828e30874SSatish Balay   for (i = 1; i < n;) {
103928e30874SSatish Balay     PREFETCH_NTA(&v[8]);
104028e30874SSatish Balay     vi  = aj + ai[i];
104128e30874SSatish Balay     nz  = diag[i] - ai[i];
104228e30874SSatish Balay     idx = 4 * (*r++);
104328e30874SSatish Balay 
104428e30874SSatish Balay     /* Demote sum from double to float */
104528e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
104628e30874SSatish Balay     LOAD_PS(tmps, XMM7);
104728e30874SSatish Balay 
104828e30874SSatish Balay     while (nz--) {
104928e30874SSatish Balay       PREFETCH_NTA(&v[16]);
105028e30874SSatish Balay       idx = 4 * (*vi++);
105128e30874SSatish Balay 
105228e30874SSatish Balay       /* Demote solution (so far) from double to float */
105328e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
105428e30874SSatish Balay 
105528e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
105628e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
105728e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
105828e30874SSatish Balay 
105928e30874SSatish Balay       /* First Column */
106028e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
106128e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
106228e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
106328e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
106428e30874SSatish Balay 
106528e30874SSatish Balay       /* Second Column */
106628e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
106728e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
106828e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
106928e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
107028e30874SSatish Balay 
107128e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
107228e30874SSatish Balay 
107328e30874SSatish Balay       /* Third Column */
107428e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
107528e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
107628e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
107728e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
107828e30874SSatish Balay 
107928e30874SSatish Balay       /* Fourth Column */
108028e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
108128e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
108228e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
108328e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
108428e30874SSatish Balay       SSE_INLINE_END_2
108528e30874SSatish Balay 
108628e30874SSatish Balay       v += 16;
108728e30874SSatish Balay     }
108828e30874SSatish Balay     idx = 4 * i;
108928e30874SSatish Balay     v   = aa + 16 * ai[++i];
109028e30874SSatish Balay     PREFETCH_NTA(v);
109128e30874SSatish Balay     STORE_PS(tmps, XMM7);
109228e30874SSatish Balay 
109328e30874SSatish Balay     /* Promote result from float to double */
109428e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
109528e30874SSatish Balay   }
109628e30874SSatish Balay   /* backward solve the upper triangular */
109728e30874SSatish Balay   idt  = 4 * (n - 1);
109828e30874SSatish Balay   ai16 = 16 * diag[n - 1];
109928e30874SSatish Balay   v    = aa + ai16 + 16;
110028e30874SSatish Balay   for (i = n - 1; i >= 0;) {
110128e30874SSatish Balay     PREFETCH_NTA(&v[8]);
110228e30874SSatish Balay     vi = aj + diag[i] + 1;
110328e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
110428e30874SSatish Balay 
110528e30874SSatish Balay     /* Demote accumulator from double to float */
110628e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
110728e30874SSatish Balay     LOAD_PS(tmps, XMM7);
110828e30874SSatish Balay 
110928e30874SSatish Balay     while (nz--) {
111028e30874SSatish Balay       PREFETCH_NTA(&v[16]);
111128e30874SSatish Balay       idx = 4 * (*vi++);
111228e30874SSatish Balay 
111328e30874SSatish Balay       /* Demote solution (so far) from double to float */
111428e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
111528e30874SSatish Balay 
111628e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
111728e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
111828e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
111928e30874SSatish Balay 
112028e30874SSatish Balay       /* First Column */
112128e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
112228e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
112328e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
112428e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
112528e30874SSatish Balay 
112628e30874SSatish Balay       /* Second Column */
112728e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
112828e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
112928e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
113028e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
113128e30874SSatish Balay 
113228e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
113328e30874SSatish Balay 
113428e30874SSatish Balay       /* Third Column */
113528e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
113628e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
113728e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
113828e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
113928e30874SSatish Balay 
114028e30874SSatish Balay       /* Fourth Column */
114128e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
114228e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
114328e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
114428e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
114528e30874SSatish Balay       SSE_INLINE_END_2
114628e30874SSatish Balay       v += 16;
114728e30874SSatish Balay     }
114828e30874SSatish Balay     v    = aa + ai16;
114928e30874SSatish Balay     ai16 = 16 * diag[--i];
115028e30874SSatish Balay     PREFETCH_NTA(aa + ai16 + 16);
115128e30874SSatish Balay     /*
115228e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
115328e30874SSatish Balay        which was inverted as part of the factorization
115428e30874SSatish Balay     */
115528e30874SSatish Balay     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
115628e30874SSatish Balay     /* First Column */
115728e30874SSatish Balay     SSE_COPY_PS(XMM0, XMM7)
115828e30874SSatish Balay     SSE_SHUFFLE(XMM0, XMM0, 0x00)
115928e30874SSatish Balay     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
116028e30874SSatish Balay 
116128e30874SSatish Balay     /* Second Column */
116228e30874SSatish Balay     SSE_COPY_PS(XMM1, XMM7)
116328e30874SSatish Balay     SSE_SHUFFLE(XMM1, XMM1, 0x55)
116428e30874SSatish Balay     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
116528e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM1)
116628e30874SSatish Balay 
116728e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
116828e30874SSatish Balay 
116928e30874SSatish Balay     /* Third Column */
117028e30874SSatish Balay     SSE_COPY_PS(XMM2, XMM7)
117128e30874SSatish Balay     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
117228e30874SSatish Balay     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
117328e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM2)
117428e30874SSatish Balay 
117528e30874SSatish Balay     /* Fourth Column */
117628e30874SSatish Balay     SSE_COPY_PS(XMM3, XMM7)
117728e30874SSatish Balay     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
117828e30874SSatish Balay     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
117928e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM3)
118028e30874SSatish Balay 
118128e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
118228e30874SSatish Balay     SSE_INLINE_END_3
118328e30874SSatish Balay 
118428e30874SSatish Balay     /* Promote solution from float to double */
118528e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
118628e30874SSatish Balay 
118728e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
118828e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
118928e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
119028e30874SSatish Balay     idc = 4 * (*c--);
119128e30874SSatish Balay     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
119228e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
119328e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
119428e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
119528e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
119628e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
119728e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
119828e30874SSatish Balay     SSE_INLINE_END_2
119928e30874SSatish Balay     v = aa + ai16 + 16;
120028e30874SSatish Balay     idt -= 4;
120128e30874SSatish Balay   }
120228e30874SSatish Balay 
12039566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
12049566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
12059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
12069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
12079566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
120828e30874SSatish Balay   SSE_SCOPE_END;
120928e30874SSatish Balay   PetscFunctionReturn(0);
121028e30874SSatish Balay }
121128e30874SSatish Balay 
121228e30874SSatish Balay #endif
121328e30874SSatish Balay 
1214*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) {
121528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
121628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
121728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
121828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
121928e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
122028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
122128e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
122228e30874SSatish Balay   const PetscScalar *b;
122328e30874SSatish Balay 
122428e30874SSatish Balay   PetscFunctionBegin;
12259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
12269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
122728e30874SSatish Balay   t = a->solve_work;
122828e30874SSatish Balay 
1229*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1230*9371c9d4SSatish Balay   r = rout;
1231*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1232*9371c9d4SSatish Balay   c = cout + (n - 1);
123328e30874SSatish Balay 
123428e30874SSatish Balay   /* forward solve the lower triangular */
123528e30874SSatish Balay   idx  = 3 * (*r++);
1236*9371c9d4SSatish Balay   t[0] = b[idx];
1237*9371c9d4SSatish Balay   t[1] = b[1 + idx];
1238*9371c9d4SSatish Balay   t[2] = b[2 + idx];
123928e30874SSatish Balay   for (i = 1; i < n; i++) {
124028e30874SSatish Balay     v   = aa + 9 * ai[i];
124128e30874SSatish Balay     vi  = aj + ai[i];
124228e30874SSatish Balay     nz  = diag[i] - ai[i];
124328e30874SSatish Balay     idx = 3 * (*r++);
1244*9371c9d4SSatish Balay     s1  = b[idx];
1245*9371c9d4SSatish Balay     s2  = b[1 + idx];
1246*9371c9d4SSatish Balay     s3  = b[2 + idx];
124728e30874SSatish Balay     while (nz--) {
124828e30874SSatish Balay       idx = 3 * (*vi++);
1249*9371c9d4SSatish Balay       x1  = t[idx];
1250*9371c9d4SSatish Balay       x2  = t[1 + idx];
1251*9371c9d4SSatish Balay       x3  = t[2 + idx];
125228e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
125328e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
125428e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
125528e30874SSatish Balay       v += 9;
125628e30874SSatish Balay     }
125728e30874SSatish Balay     idx        = 3 * i;
1258*9371c9d4SSatish Balay     t[idx]     = s1;
1259*9371c9d4SSatish Balay     t[1 + idx] = s2;
1260*9371c9d4SSatish Balay     t[2 + idx] = s3;
126128e30874SSatish Balay   }
126228e30874SSatish Balay   /* backward solve the upper triangular */
126328e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
126428e30874SSatish Balay     v   = aa + 9 * diag[i] + 9;
126528e30874SSatish Balay     vi  = aj + diag[i] + 1;
126628e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
126728e30874SSatish Balay     idt = 3 * i;
1268*9371c9d4SSatish Balay     s1  = t[idt];
1269*9371c9d4SSatish Balay     s2  = t[1 + idt];
1270*9371c9d4SSatish Balay     s3  = t[2 + idt];
127128e30874SSatish Balay     while (nz--) {
127228e30874SSatish Balay       idx = 3 * (*vi++);
1273*9371c9d4SSatish Balay       x1  = t[idx];
1274*9371c9d4SSatish Balay       x2  = t[1 + idx];
1275*9371c9d4SSatish Balay       x3  = t[2 + idx];
127628e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
127728e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
127828e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
127928e30874SSatish Balay       v += 9;
128028e30874SSatish Balay     }
128128e30874SSatish Balay     idc    = 3 * (*c--);
128228e30874SSatish Balay     v      = aa + 9 * diag[i];
128328e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
128428e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
128528e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
128628e30874SSatish Balay   }
12879566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
12889566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
12899566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
12909566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
12919566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
129228e30874SSatish Balay   PetscFunctionReturn(0);
129328e30874SSatish Balay }
129428e30874SSatish Balay 
1295*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) {
129628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
129728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
129828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
129928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
130028e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
130128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
130228e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
130328e30874SSatish Balay   const PetscScalar *b;
130428e30874SSatish Balay 
130528e30874SSatish Balay   PetscFunctionBegin;
13069566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
13079566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
130828e30874SSatish Balay   t = a->solve_work;
130928e30874SSatish Balay 
1310*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1311*9371c9d4SSatish Balay   r = rout;
1312*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1313*9371c9d4SSatish Balay   c = cout;
131428e30874SSatish Balay 
131528e30874SSatish Balay   /* forward solve the lower triangular */
131628e30874SSatish Balay   idx  = 3 * r[0];
1317*9371c9d4SSatish Balay   t[0] = b[idx];
1318*9371c9d4SSatish Balay   t[1] = b[1 + idx];
1319*9371c9d4SSatish Balay   t[2] = b[2 + idx];
132028e30874SSatish Balay   for (i = 1; i < n; i++) {
132128e30874SSatish Balay     v   = aa + 9 * ai[i];
132228e30874SSatish Balay     vi  = aj + ai[i];
132328e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
132428e30874SSatish Balay     idx = 3 * r[i];
1325*9371c9d4SSatish Balay     s1  = b[idx];
1326*9371c9d4SSatish Balay     s2  = b[1 + idx];
1327*9371c9d4SSatish Balay     s3  = b[2 + idx];
132828e30874SSatish Balay     for (m = 0; m < nz; m++) {
132928e30874SSatish Balay       idx = 3 * vi[m];
1330*9371c9d4SSatish Balay       x1  = t[idx];
1331*9371c9d4SSatish Balay       x2  = t[1 + idx];
1332*9371c9d4SSatish Balay       x3  = t[2 + idx];
133328e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
133428e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
133528e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
133628e30874SSatish Balay       v += 9;
133728e30874SSatish Balay     }
133828e30874SSatish Balay     idx        = 3 * i;
1339*9371c9d4SSatish Balay     t[idx]     = s1;
1340*9371c9d4SSatish Balay     t[1 + idx] = s2;
1341*9371c9d4SSatish Balay     t[2 + idx] = s3;
134228e30874SSatish Balay   }
134328e30874SSatish Balay   /* backward solve the upper triangular */
134428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
134528e30874SSatish Balay     v   = aa + 9 * (adiag[i + 1] + 1);
134628e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
134728e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
134828e30874SSatish Balay     idt = 3 * i;
1349*9371c9d4SSatish Balay     s1  = t[idt];
1350*9371c9d4SSatish Balay     s2  = t[1 + idt];
1351*9371c9d4SSatish Balay     s3  = t[2 + idt];
135228e30874SSatish Balay     for (m = 0; m < nz; m++) {
135328e30874SSatish Balay       idx = 3 * vi[m];
1354*9371c9d4SSatish Balay       x1  = t[idx];
1355*9371c9d4SSatish Balay       x2  = t[1 + idx];
1356*9371c9d4SSatish Balay       x3  = t[2 + idx];
135728e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
135828e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
135928e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
136028e30874SSatish Balay       v += 9;
136128e30874SSatish Balay     }
136228e30874SSatish Balay     idc    = 3 * c[i];
136328e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
136428e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
136528e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
136628e30874SSatish Balay   }
13679566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
13689566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
13699566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
13709566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
13719566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
137228e30874SSatish Balay   PetscFunctionReturn(0);
137328e30874SSatish Balay }
137428e30874SSatish Balay 
1375*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) {
137628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
137728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
137828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
137928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
138028e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
138128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
138228e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
138328e30874SSatish Balay   const PetscScalar *b;
138428e30874SSatish Balay 
138528e30874SSatish Balay   PetscFunctionBegin;
13869566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
13879566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
138828e30874SSatish Balay   t = a->solve_work;
138928e30874SSatish Balay 
1390*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1391*9371c9d4SSatish Balay   r = rout;
1392*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1393*9371c9d4SSatish Balay   c = cout + (n - 1);
139428e30874SSatish Balay 
139528e30874SSatish Balay   /* forward solve the lower triangular */
139628e30874SSatish Balay   idx  = 2 * (*r++);
1397*9371c9d4SSatish Balay   t[0] = b[idx];
1398*9371c9d4SSatish Balay   t[1] = b[1 + idx];
139928e30874SSatish Balay   for (i = 1; i < n; i++) {
140028e30874SSatish Balay     v   = aa + 4 * ai[i];
140128e30874SSatish Balay     vi  = aj + ai[i];
140228e30874SSatish Balay     nz  = diag[i] - ai[i];
140328e30874SSatish Balay     idx = 2 * (*r++);
1404*9371c9d4SSatish Balay     s1  = b[idx];
1405*9371c9d4SSatish Balay     s2  = b[1 + idx];
140628e30874SSatish Balay     while (nz--) {
140728e30874SSatish Balay       idx = 2 * (*vi++);
1408*9371c9d4SSatish Balay       x1  = t[idx];
1409*9371c9d4SSatish Balay       x2  = t[1 + idx];
141028e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
141128e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
141228e30874SSatish Balay       v += 4;
141328e30874SSatish Balay     }
141428e30874SSatish Balay     idx        = 2 * i;
1415*9371c9d4SSatish Balay     t[idx]     = s1;
1416*9371c9d4SSatish Balay     t[1 + idx] = s2;
141728e30874SSatish Balay   }
141828e30874SSatish Balay   /* backward solve the upper triangular */
141928e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
142028e30874SSatish Balay     v   = aa + 4 * diag[i] + 4;
142128e30874SSatish Balay     vi  = aj + diag[i] + 1;
142228e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
142328e30874SSatish Balay     idt = 2 * i;
1424*9371c9d4SSatish Balay     s1  = t[idt];
1425*9371c9d4SSatish Balay     s2  = t[1 + idt];
142628e30874SSatish Balay     while (nz--) {
142728e30874SSatish Balay       idx = 2 * (*vi++);
1428*9371c9d4SSatish Balay       x1  = t[idx];
1429*9371c9d4SSatish Balay       x2  = t[1 + idx];
143028e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
143128e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
143228e30874SSatish Balay       v += 4;
143328e30874SSatish Balay     }
143428e30874SSatish Balay     idc    = 2 * (*c--);
143528e30874SSatish Balay     v      = aa + 4 * diag[i];
143628e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
143728e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
143828e30874SSatish Balay   }
14399566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
14409566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
14419566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
14429566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
14439566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
144428e30874SSatish Balay   PetscFunctionReturn(0);
144528e30874SSatish Balay }
144628e30874SSatish Balay 
1447*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) {
144828e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
144928e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
145028e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
145128e30874SSatish Balay   PetscInt           i, nz, idx, jdx, idt, idc, m;
145228e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
145328e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
145428e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
145528e30874SSatish Balay   const PetscScalar *b;
145628e30874SSatish Balay 
145728e30874SSatish Balay   PetscFunctionBegin;
14589566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
14599566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
146028e30874SSatish Balay   t = a->solve_work;
146128e30874SSatish Balay 
1462*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1463*9371c9d4SSatish Balay   r = rout;
1464*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1465*9371c9d4SSatish Balay   c = cout;
146628e30874SSatish Balay 
146728e30874SSatish Balay   /* forward solve the lower triangular */
146828e30874SSatish Balay   idx  = 2 * r[0];
1469*9371c9d4SSatish Balay   t[0] = b[idx];
1470*9371c9d4SSatish Balay   t[1] = b[1 + idx];
147128e30874SSatish Balay   for (i = 1; i < n; i++) {
147228e30874SSatish Balay     v   = aa + 4 * ai[i];
147328e30874SSatish Balay     vi  = aj + ai[i];
147428e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
147528e30874SSatish Balay     idx = 2 * r[i];
1476*9371c9d4SSatish Balay     s1  = b[idx];
1477*9371c9d4SSatish Balay     s2  = b[1 + idx];
147828e30874SSatish Balay     for (m = 0; m < nz; m++) {
147928e30874SSatish Balay       jdx = 2 * vi[m];
1480*9371c9d4SSatish Balay       x1  = t[jdx];
1481*9371c9d4SSatish Balay       x2  = t[1 + jdx];
148228e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
148328e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
148428e30874SSatish Balay       v += 4;
148528e30874SSatish Balay     }
148628e30874SSatish Balay     idx        = 2 * i;
1487*9371c9d4SSatish Balay     t[idx]     = s1;
1488*9371c9d4SSatish Balay     t[1 + idx] = s2;
148928e30874SSatish Balay   }
149028e30874SSatish Balay   /* backward solve the upper triangular */
149128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
149228e30874SSatish Balay     v   = aa + 4 * (adiag[i + 1] + 1);
149328e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
149428e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
149528e30874SSatish Balay     idt = 2 * i;
1496*9371c9d4SSatish Balay     s1  = t[idt];
1497*9371c9d4SSatish Balay     s2  = t[1 + idt];
149828e30874SSatish Balay     for (m = 0; m < nz; m++) {
149928e30874SSatish Balay       idx = 2 * vi[m];
1500*9371c9d4SSatish Balay       x1  = t[idx];
1501*9371c9d4SSatish Balay       x2  = t[1 + idx];
150228e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
150328e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
150428e30874SSatish Balay       v += 4;
150528e30874SSatish Balay     }
150628e30874SSatish Balay     idc    = 2 * c[i];
150728e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
150828e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
150928e30874SSatish Balay   }
15109566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15119566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15129566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15139566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15149566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
151528e30874SSatish Balay   PetscFunctionReturn(0);
151628e30874SSatish Balay }
151728e30874SSatish Balay 
1518*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) {
151928e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
152028e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
152128e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
152228e30874SSatish Balay   PetscInt           i, nz;
152328e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
152428e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
152528e30874SSatish Balay   PetscScalar       *x, s1, *t;
152628e30874SSatish Balay   const PetscScalar *b;
152728e30874SSatish Balay 
152828e30874SSatish Balay   PetscFunctionBegin;
152928e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
153028e30874SSatish Balay 
15319566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15329566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
153328e30874SSatish Balay   t = a->solve_work;
153428e30874SSatish Balay 
1535*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1536*9371c9d4SSatish Balay   r = rout;
1537*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1538*9371c9d4SSatish Balay   c = cout + (n - 1);
153928e30874SSatish Balay 
154028e30874SSatish Balay   /* forward solve the lower triangular */
154128e30874SSatish Balay   t[0] = b[*r++];
154228e30874SSatish Balay   for (i = 1; i < n; i++) {
154328e30874SSatish Balay     v  = aa + ai[i];
154428e30874SSatish Balay     vi = aj + ai[i];
154528e30874SSatish Balay     nz = diag[i] - ai[i];
154628e30874SSatish Balay     s1 = b[*r++];
1547*9371c9d4SSatish Balay     while (nz--) { s1 -= (*v++) * t[*vi++]; }
154828e30874SSatish Balay     t[i] = s1;
154928e30874SSatish Balay   }
155028e30874SSatish Balay   /* backward solve the upper triangular */
155128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
155228e30874SSatish Balay     v  = aa + diag[i] + 1;
155328e30874SSatish Balay     vi = aj + diag[i] + 1;
155428e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
155528e30874SSatish Balay     s1 = t[i];
1556*9371c9d4SSatish Balay     while (nz--) { s1 -= (*v++) * t[*vi++]; }
155728e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]] * s1;
155828e30874SSatish Balay   }
155928e30874SSatish Balay 
15609566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15619566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15629566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15639566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15649566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
156528e30874SSatish Balay   PetscFunctionReturn(0);
156628e30874SSatish Balay }
156728e30874SSatish Balay 
1568*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) {
156928e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
157028e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
157128e30874SSatish Balay   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
157228e30874SSatish Balay   const PetscInt    *rout, *cout, *r, *c;
157328e30874SSatish Balay   PetscScalar       *x, *tmp, sum;
157428e30874SSatish Balay   const PetscScalar *b;
157528e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
157628e30874SSatish Balay 
157728e30874SSatish Balay   PetscFunctionBegin;
157828e30874SSatish Balay   if (!n) PetscFunctionReturn(0);
157928e30874SSatish Balay 
15809566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15819566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
158228e30874SSatish Balay   tmp = a->solve_work;
158328e30874SSatish Balay 
1584*9371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1585*9371c9d4SSatish Balay   r = rout;
1586*9371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
1587*9371c9d4SSatish Balay   c = cout;
158828e30874SSatish Balay 
158928e30874SSatish Balay   /* forward solve the lower triangular */
159028e30874SSatish Balay   tmp[0] = b[r[0]];
159128e30874SSatish Balay   v      = aa;
159228e30874SSatish Balay   vi     = aj;
159328e30874SSatish Balay   for (i = 1; i < n; i++) {
159428e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
159528e30874SSatish Balay     sum = b[r[i]];
159628e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
159728e30874SSatish Balay     tmp[i] = sum;
1598*9371c9d4SSatish Balay     v += nz;
1599*9371c9d4SSatish Balay     vi += nz;
160028e30874SSatish Balay   }
160128e30874SSatish Balay 
160228e30874SSatish Balay   /* backward solve the upper triangular */
160328e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
160428e30874SSatish Balay     v   = aa + adiag[i + 1] + 1;
160528e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
160628e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
160728e30874SSatish Balay     sum = tmp[i];
160828e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
160928e30874SSatish Balay     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
161028e30874SSatish Balay   }
161128e30874SSatish Balay 
16129566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
16139566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
16149566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
16159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
16169566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
161728e30874SSatish Balay   PetscFunctionReturn(0);
161828e30874SSatish Balay }
1619