xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision ce6b3536d5fd46c99c6df19dc764ec34b4d2762c)
128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
328e30874SSatish Balay 
4d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5d71ae5a4SJacob Faibussowitsch {
628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
828e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
928e30874SSatish Balay   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
1028e30874SSatish Balay   PetscInt           i, nz;
1128e30874SSatish Balay   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
1228e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
1328e30874SSatish Balay   PetscScalar       *x, *s, *t, *ls;
1428e30874SSatish Balay   const PetscScalar *b;
1528e30874SSatish Balay 
1628e30874SSatish Balay   PetscFunctionBegin;
17*ce6b3536SJed Brown   PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs);
189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
2028e30874SSatish Balay   t = a->solve_work;
2128e30874SSatish Balay 
229371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
239371c9d4SSatish Balay   r = rout;
249371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
259371c9d4SSatish Balay   c = cout + (n - 1);
2628e30874SSatish Balay 
2728e30874SSatish Balay   /* forward solve the lower triangular */
289566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
2928e30874SSatish Balay   for (i = 1; i < n; i++) {
3028e30874SSatish Balay     v  = aa + bs2 * ai[i];
3128e30874SSatish Balay     vi = aj + ai[i];
3228e30874SSatish Balay     nz = a->diag[i] - ai[i];
3328e30874SSatish Balay     s  = t + bs * i;
349566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
3528e30874SSatish Balay     while (nz--) {
3628e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
3728e30874SSatish Balay       v += bs2;
3828e30874SSatish Balay     }
3928e30874SSatish Balay   }
4028e30874SSatish Balay   /* backward solve the upper triangular */
4128e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
4228e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
4328e30874SSatish Balay     v  = aa + bs2 * (a->diag[i] + 1);
4428e30874SSatish Balay     vi = aj + a->diag[i] + 1;
4528e30874SSatish Balay     nz = ai[i + 1] - a->diag[i] - 1;
469566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
4728e30874SSatish Balay     while (nz--) {
4828e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
4928e30874SSatish Balay       v += bs2;
5028e30874SSatish Balay     }
5128e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
529566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
5328e30874SSatish Balay   }
5428e30874SSatish Balay 
559566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
569566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
589566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
599566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
603ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6128e30874SSatish Balay }
6228e30874SSatish Balay 
63d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
64d71ae5a4SJacob Faibussowitsch {
6528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
6628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
6728e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
6828e30874SSatish Balay   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
6928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
7028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
7128e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
7228e30874SSatish Balay   const PetscScalar *b;
7328e30874SSatish Balay 
7428e30874SSatish Balay   PetscFunctionBegin;
759566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
769566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
7728e30874SSatish Balay   t = a->solve_work;
7828e30874SSatish Balay 
799371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
809371c9d4SSatish Balay   r = rout;
819371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
829371c9d4SSatish Balay   c = cout + (n - 1);
8328e30874SSatish Balay 
8428e30874SSatish Balay   /* forward solve the lower triangular */
8528e30874SSatish Balay   idx  = 7 * (*r++);
869371c9d4SSatish Balay   t[0] = b[idx];
879371c9d4SSatish Balay   t[1] = b[1 + idx];
889371c9d4SSatish Balay   t[2] = b[2 + idx];
899371c9d4SSatish Balay   t[3] = b[3 + idx];
909371c9d4SSatish Balay   t[4] = b[4 + idx];
919371c9d4SSatish Balay   t[5] = b[5 + idx];
929371c9d4SSatish Balay   t[6] = b[6 + idx];
9328e30874SSatish Balay 
9428e30874SSatish Balay   for (i = 1; i < n; i++) {
9528e30874SSatish Balay     v   = aa + 49 * ai[i];
9628e30874SSatish Balay     vi  = aj + ai[i];
9728e30874SSatish Balay     nz  = diag[i] - ai[i];
9828e30874SSatish Balay     idx = 7 * (*r++);
999371c9d4SSatish Balay     s1  = b[idx];
1009371c9d4SSatish Balay     s2  = b[1 + idx];
1019371c9d4SSatish Balay     s3  = b[2 + idx];
1029371c9d4SSatish Balay     s4  = b[3 + idx];
1039371c9d4SSatish Balay     s5  = b[4 + idx];
1049371c9d4SSatish Balay     s6  = b[5 + idx];
1059371c9d4SSatish Balay     s7  = b[6 + idx];
10628e30874SSatish Balay     while (nz--) {
10728e30874SSatish Balay       idx = 7 * (*vi++);
1089371c9d4SSatish Balay       x1  = t[idx];
1099371c9d4SSatish Balay       x2  = t[1 + idx];
1109371c9d4SSatish Balay       x3  = t[2 + idx];
1119371c9d4SSatish Balay       x4  = t[3 + idx];
1129371c9d4SSatish Balay       x5  = t[4 + idx];
1139371c9d4SSatish Balay       x6  = t[5 + idx];
1149371c9d4SSatish Balay       x7  = t[6 + idx];
11528e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
11628e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
11728e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
11828e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
11928e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
12028e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
12128e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
12228e30874SSatish Balay       v += 49;
12328e30874SSatish Balay     }
12428e30874SSatish Balay     idx        = 7 * i;
1259371c9d4SSatish Balay     t[idx]     = s1;
1269371c9d4SSatish Balay     t[1 + idx] = s2;
1279371c9d4SSatish Balay     t[2 + idx] = s3;
1289371c9d4SSatish Balay     t[3 + idx] = s4;
1299371c9d4SSatish Balay     t[4 + idx] = s5;
1309371c9d4SSatish Balay     t[5 + idx] = s6;
1319371c9d4SSatish Balay     t[6 + idx] = s7;
13228e30874SSatish Balay   }
13328e30874SSatish Balay   /* backward solve the upper triangular */
13428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
13528e30874SSatish Balay     v   = aa + 49 * diag[i] + 49;
13628e30874SSatish Balay     vi  = aj + diag[i] + 1;
13728e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
13828e30874SSatish Balay     idt = 7 * i;
1399371c9d4SSatish Balay     s1  = t[idt];
1409371c9d4SSatish Balay     s2  = t[1 + idt];
1419371c9d4SSatish Balay     s3  = t[2 + idt];
1429371c9d4SSatish Balay     s4  = t[3 + idt];
1439371c9d4SSatish Balay     s5  = t[4 + idt];
1449371c9d4SSatish Balay     s6  = t[5 + idt];
1459371c9d4SSatish Balay     s7  = t[6 + idt];
14628e30874SSatish Balay     while (nz--) {
14728e30874SSatish Balay       idx = 7 * (*vi++);
1489371c9d4SSatish Balay       x1  = t[idx];
1499371c9d4SSatish Balay       x2  = t[1 + idx];
1509371c9d4SSatish Balay       x3  = t[2 + idx];
1519371c9d4SSatish Balay       x4  = t[3 + idx];
1529371c9d4SSatish Balay       x5  = t[4 + idx];
1539371c9d4SSatish Balay       x6  = t[5 + idx];
1549371c9d4SSatish Balay       x7  = t[6 + idx];
15528e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
15628e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
15728e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
15828e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
15928e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
16028e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
16128e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
16228e30874SSatish Balay       v += 49;
16328e30874SSatish Balay     }
16428e30874SSatish Balay     idc    = 7 * (*c--);
16528e30874SSatish Balay     v      = aa + 49 * diag[i];
1669371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
1679371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
1689371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
1699371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
1709371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
1719371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
1729371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
17328e30874SSatish Balay   }
17428e30874SSatish Balay 
1759566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
1769566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
1779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1799566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
1803ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18128e30874SSatish Balay }
18228e30874SSatish Balay 
183d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
184d71ae5a4SJacob Faibussowitsch {
18528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
18628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
18728e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
18828e30874SSatish Balay   const PetscInt     n = a->mbs, *rout, *cout, *vi;
18928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
19028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
19128e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
19228e30874SSatish Balay   const PetscScalar *b;
19328e30874SSatish Balay 
19428e30874SSatish Balay   PetscFunctionBegin;
1959566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
1969566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
19728e30874SSatish Balay   t = a->solve_work;
19828e30874SSatish Balay 
1999371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
2009371c9d4SSatish Balay   r = rout;
2019371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
2029371c9d4SSatish Balay   c = cout;
20328e30874SSatish Balay 
20428e30874SSatish Balay   /* forward solve the lower triangular */
20528e30874SSatish Balay   idx  = 7 * r[0];
2069371c9d4SSatish Balay   t[0] = b[idx];
2079371c9d4SSatish Balay   t[1] = b[1 + idx];
2089371c9d4SSatish Balay   t[2] = b[2 + idx];
2099371c9d4SSatish Balay   t[3] = b[3 + idx];
2109371c9d4SSatish Balay   t[4] = b[4 + idx];
2119371c9d4SSatish Balay   t[5] = b[5 + idx];
2129371c9d4SSatish Balay   t[6] = b[6 + idx];
21328e30874SSatish Balay 
21428e30874SSatish Balay   for (i = 1; i < n; i++) {
21528e30874SSatish Balay     v   = aa + 49 * ai[i];
21628e30874SSatish Balay     vi  = aj + ai[i];
21728e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
21828e30874SSatish Balay     idx = 7 * r[i];
2199371c9d4SSatish Balay     s1  = b[idx];
2209371c9d4SSatish Balay     s2  = b[1 + idx];
2219371c9d4SSatish Balay     s3  = b[2 + idx];
2229371c9d4SSatish Balay     s4  = b[3 + idx];
2239371c9d4SSatish Balay     s5  = b[4 + idx];
2249371c9d4SSatish Balay     s6  = b[5 + idx];
2259371c9d4SSatish Balay     s7  = b[6 + idx];
22628e30874SSatish Balay     for (m = 0; m < nz; m++) {
22728e30874SSatish Balay       idx = 7 * vi[m];
2289371c9d4SSatish Balay       x1  = t[idx];
2299371c9d4SSatish Balay       x2  = t[1 + idx];
2309371c9d4SSatish Balay       x3  = t[2 + idx];
2319371c9d4SSatish Balay       x4  = t[3 + idx];
2329371c9d4SSatish Balay       x5  = t[4 + idx];
2339371c9d4SSatish Balay       x6  = t[5 + idx];
2349371c9d4SSatish Balay       x7  = t[6 + idx];
23528e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
23628e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
23728e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
23828e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
23928e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
24028e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
24128e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
24228e30874SSatish Balay       v += 49;
24328e30874SSatish Balay     }
24428e30874SSatish Balay     idx        = 7 * i;
2459371c9d4SSatish Balay     t[idx]     = s1;
2469371c9d4SSatish Balay     t[1 + idx] = s2;
2479371c9d4SSatish Balay     t[2 + idx] = s3;
2489371c9d4SSatish Balay     t[3 + idx] = s4;
2499371c9d4SSatish Balay     t[4 + idx] = s5;
2509371c9d4SSatish Balay     t[5 + idx] = s6;
2519371c9d4SSatish Balay     t[6 + idx] = s7;
25228e30874SSatish Balay   }
25328e30874SSatish Balay   /* backward solve the upper triangular */
25428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
25528e30874SSatish Balay     v   = aa + 49 * (adiag[i + 1] + 1);
25628e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
25728e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
25828e30874SSatish Balay     idt = 7 * i;
2599371c9d4SSatish Balay     s1  = t[idt];
2609371c9d4SSatish Balay     s2  = t[1 + idt];
2619371c9d4SSatish Balay     s3  = t[2 + idt];
2629371c9d4SSatish Balay     s4  = t[3 + idt];
2639371c9d4SSatish Balay     s5  = t[4 + idt];
2649371c9d4SSatish Balay     s6  = t[5 + idt];
2659371c9d4SSatish Balay     s7  = t[6 + idt];
26628e30874SSatish Balay     for (m = 0; m < nz; m++) {
26728e30874SSatish Balay       idx = 7 * vi[m];
2689371c9d4SSatish Balay       x1  = t[idx];
2699371c9d4SSatish Balay       x2  = t[1 + idx];
2709371c9d4SSatish Balay       x3  = t[2 + idx];
2719371c9d4SSatish Balay       x4  = t[3 + idx];
2729371c9d4SSatish Balay       x5  = t[4 + idx];
2739371c9d4SSatish Balay       x6  = t[5 + idx];
2749371c9d4SSatish Balay       x7  = t[6 + idx];
27528e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
27628e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
27728e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
27828e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
27928e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
28028e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
28128e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
28228e30874SSatish Balay       v += 49;
28328e30874SSatish Balay     }
28428e30874SSatish Balay     idc    = 7 * c[i];
2859371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
2869371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
2879371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
2889371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
2899371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
2909371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
2919371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
29228e30874SSatish Balay   }
29328e30874SSatish Balay 
2949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
2959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
2969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
2979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
2989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
2993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
30028e30874SSatish Balay }
30128e30874SSatish Balay 
302d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
303d71ae5a4SJacob Faibussowitsch {
30428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
30528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
30628e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
30728e30874SSatish Balay   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
30828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
30928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
31028e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
31128e30874SSatish Balay   const PetscScalar *b;
31228e30874SSatish Balay 
31328e30874SSatish Balay   PetscFunctionBegin;
3149566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
3159566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
31628e30874SSatish Balay   t = a->solve_work;
31728e30874SSatish Balay 
3189371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
3199371c9d4SSatish Balay   r = rout;
3209371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
3219371c9d4SSatish Balay   c = cout + (n - 1);
32228e30874SSatish Balay 
32328e30874SSatish Balay   /* forward solve the lower triangular */
32428e30874SSatish Balay   idx  = 6 * (*r++);
3259371c9d4SSatish Balay   t[0] = b[idx];
3269371c9d4SSatish Balay   t[1] = b[1 + idx];
3279371c9d4SSatish Balay   t[2] = b[2 + idx];
3289371c9d4SSatish Balay   t[3] = b[3 + idx];
3299371c9d4SSatish Balay   t[4] = b[4 + idx];
3309371c9d4SSatish Balay   t[5] = b[5 + idx];
33128e30874SSatish Balay   for (i = 1; i < n; i++) {
33228e30874SSatish Balay     v   = aa + 36 * ai[i];
33328e30874SSatish Balay     vi  = aj + ai[i];
33428e30874SSatish Balay     nz  = diag[i] - ai[i];
33528e30874SSatish Balay     idx = 6 * (*r++);
3369371c9d4SSatish Balay     s1  = b[idx];
3379371c9d4SSatish Balay     s2  = b[1 + idx];
3389371c9d4SSatish Balay     s3  = b[2 + idx];
3399371c9d4SSatish Balay     s4  = b[3 + idx];
3409371c9d4SSatish Balay     s5  = b[4 + idx];
3419371c9d4SSatish Balay     s6  = b[5 + idx];
34228e30874SSatish Balay     while (nz--) {
34328e30874SSatish Balay       idx = 6 * (*vi++);
3449371c9d4SSatish Balay       x1  = t[idx];
3459371c9d4SSatish Balay       x2  = t[1 + idx];
3469371c9d4SSatish Balay       x3  = t[2 + idx];
3479371c9d4SSatish Balay       x4  = t[3 + idx];
3489371c9d4SSatish Balay       x5  = t[4 + idx];
3499371c9d4SSatish Balay       x6  = t[5 + idx];
35028e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
35128e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
35228e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
35328e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
35428e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
35528e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
35628e30874SSatish Balay       v += 36;
35728e30874SSatish Balay     }
35828e30874SSatish Balay     idx        = 6 * i;
3599371c9d4SSatish Balay     t[idx]     = s1;
3609371c9d4SSatish Balay     t[1 + idx] = s2;
3619371c9d4SSatish Balay     t[2 + idx] = s3;
3629371c9d4SSatish Balay     t[3 + idx] = s4;
3639371c9d4SSatish Balay     t[4 + idx] = s5;
3649371c9d4SSatish Balay     t[5 + idx] = s6;
36528e30874SSatish Balay   }
36628e30874SSatish Balay   /* backward solve the upper triangular */
36728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
36828e30874SSatish Balay     v   = aa + 36 * diag[i] + 36;
36928e30874SSatish Balay     vi  = aj + diag[i] + 1;
37028e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
37128e30874SSatish Balay     idt = 6 * i;
3729371c9d4SSatish Balay     s1  = t[idt];
3739371c9d4SSatish Balay     s2  = t[1 + idt];
3749371c9d4SSatish Balay     s3  = t[2 + idt];
3759371c9d4SSatish Balay     s4  = t[3 + idt];
3769371c9d4SSatish Balay     s5  = t[4 + idt];
3779371c9d4SSatish Balay     s6  = t[5 + idt];
37828e30874SSatish Balay     while (nz--) {
37928e30874SSatish Balay       idx = 6 * (*vi++);
3809371c9d4SSatish Balay       x1  = t[idx];
3819371c9d4SSatish Balay       x2  = t[1 + idx];
3829371c9d4SSatish Balay       x3  = t[2 + idx];
3839371c9d4SSatish Balay       x4  = t[3 + idx];
3849371c9d4SSatish Balay       x5  = t[4 + idx];
3859371c9d4SSatish Balay       x6  = t[5 + idx];
38628e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
38728e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
38828e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
38928e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
39028e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
39128e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
39228e30874SSatish Balay       v += 36;
39328e30874SSatish Balay     }
39428e30874SSatish Balay     idc    = 6 * (*c--);
39528e30874SSatish Balay     v      = aa + 36 * diag[i];
3969371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
3979371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
3989371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
3999371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
4009371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
4019371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
40228e30874SSatish Balay   }
40328e30874SSatish Balay 
4049566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
4059566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
4069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
4079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
4089566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
4093ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
41028e30874SSatish Balay }
41128e30874SSatish Balay 
412d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
413d71ae5a4SJacob Faibussowitsch {
41428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
41528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
41628e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
41728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
41828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
41928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
42028e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
42128e30874SSatish Balay   const PetscScalar *b;
42228e30874SSatish Balay 
42328e30874SSatish Balay   PetscFunctionBegin;
4249566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
4259566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42628e30874SSatish Balay   t = a->solve_work;
42728e30874SSatish Balay 
4289371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
4299371c9d4SSatish Balay   r = rout;
4309371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
4319371c9d4SSatish Balay   c = cout;
43228e30874SSatish Balay 
43328e30874SSatish Balay   /* forward solve the lower triangular */
43428e30874SSatish Balay   idx  = 6 * r[0];
4359371c9d4SSatish Balay   t[0] = b[idx];
4369371c9d4SSatish Balay   t[1] = b[1 + idx];
4379371c9d4SSatish Balay   t[2] = b[2 + idx];
4389371c9d4SSatish Balay   t[3] = b[3 + idx];
4399371c9d4SSatish Balay   t[4] = b[4 + idx];
4409371c9d4SSatish Balay   t[5] = b[5 + idx];
44128e30874SSatish Balay   for (i = 1; i < n; i++) {
44228e30874SSatish Balay     v   = aa + 36 * ai[i];
44328e30874SSatish Balay     vi  = aj + ai[i];
44428e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
44528e30874SSatish Balay     idx = 6 * r[i];
4469371c9d4SSatish Balay     s1  = b[idx];
4479371c9d4SSatish Balay     s2  = b[1 + idx];
4489371c9d4SSatish Balay     s3  = b[2 + idx];
4499371c9d4SSatish Balay     s4  = b[3 + idx];
4509371c9d4SSatish Balay     s5  = b[4 + idx];
4519371c9d4SSatish Balay     s6  = b[5 + idx];
45228e30874SSatish Balay     for (m = 0; m < nz; m++) {
45328e30874SSatish Balay       idx = 6 * vi[m];
4549371c9d4SSatish Balay       x1  = t[idx];
4559371c9d4SSatish Balay       x2  = t[1 + idx];
4569371c9d4SSatish Balay       x3  = t[2 + idx];
4579371c9d4SSatish Balay       x4  = t[3 + idx];
4589371c9d4SSatish Balay       x5  = t[4 + idx];
4599371c9d4SSatish Balay       x6  = t[5 + idx];
46028e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
46128e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
46228e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
46328e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
46428e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
46528e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
46628e30874SSatish Balay       v += 36;
46728e30874SSatish Balay     }
46828e30874SSatish Balay     idx        = 6 * i;
4699371c9d4SSatish Balay     t[idx]     = s1;
4709371c9d4SSatish Balay     t[1 + idx] = s2;
4719371c9d4SSatish Balay     t[2 + idx] = s3;
4729371c9d4SSatish Balay     t[3 + idx] = s4;
4739371c9d4SSatish Balay     t[4 + idx] = s5;
4749371c9d4SSatish Balay     t[5 + idx] = s6;
47528e30874SSatish Balay   }
47628e30874SSatish Balay   /* backward solve the upper triangular */
47728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
47828e30874SSatish Balay     v   = aa + 36 * (adiag[i + 1] + 1);
47928e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
48028e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
48128e30874SSatish Balay     idt = 6 * i;
4829371c9d4SSatish Balay     s1  = t[idt];
4839371c9d4SSatish Balay     s2  = t[1 + idt];
4849371c9d4SSatish Balay     s3  = t[2 + idt];
4859371c9d4SSatish Balay     s4  = t[3 + idt];
4869371c9d4SSatish Balay     s5  = t[4 + idt];
4879371c9d4SSatish Balay     s6  = t[5 + idt];
48828e30874SSatish Balay     for (m = 0; m < nz; m++) {
48928e30874SSatish Balay       idx = 6 * vi[m];
4909371c9d4SSatish Balay       x1  = t[idx];
4919371c9d4SSatish Balay       x2  = t[1 + idx];
4929371c9d4SSatish Balay       x3  = t[2 + idx];
4939371c9d4SSatish Balay       x4  = t[3 + idx];
4949371c9d4SSatish Balay       x5  = t[4 + idx];
4959371c9d4SSatish Balay       x6  = t[5 + idx];
49628e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
49728e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
49828e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
49928e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
50028e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
50128e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
50228e30874SSatish Balay       v += 36;
50328e30874SSatish Balay     }
50428e30874SSatish Balay     idc    = 6 * c[i];
5059371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
5069371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
5079371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
5089371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
5099371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
5109371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
51128e30874SSatish Balay   }
51228e30874SSatish Balay 
5139566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
5149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
5159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
5169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
5179566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
5183ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
51928e30874SSatish Balay }
52028e30874SSatish Balay 
521d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
522d71ae5a4SJacob Faibussowitsch {
52328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
52428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
52528e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
52628e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
52728e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
52828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
52928e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
53028e30874SSatish Balay   const PetscScalar *b;
53128e30874SSatish Balay 
53228e30874SSatish Balay   PetscFunctionBegin;
5339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
5349566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
53528e30874SSatish Balay   t = a->solve_work;
53628e30874SSatish Balay 
5379371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
5389371c9d4SSatish Balay   r = rout;
5399371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
5409371c9d4SSatish Balay   c = cout + (n - 1);
54128e30874SSatish Balay 
54228e30874SSatish Balay   /* forward solve the lower triangular */
54328e30874SSatish Balay   idx  = 5 * (*r++);
5449371c9d4SSatish Balay   t[0] = b[idx];
5459371c9d4SSatish Balay   t[1] = b[1 + idx];
5469371c9d4SSatish Balay   t[2] = b[2 + idx];
5479371c9d4SSatish Balay   t[3] = b[3 + idx];
5489371c9d4SSatish Balay   t[4] = b[4 + idx];
54928e30874SSatish Balay   for (i = 1; i < n; i++) {
55028e30874SSatish Balay     v   = aa + 25 * ai[i];
55128e30874SSatish Balay     vi  = aj + ai[i];
55228e30874SSatish Balay     nz  = diag[i] - ai[i];
55328e30874SSatish Balay     idx = 5 * (*r++);
5549371c9d4SSatish Balay     s1  = b[idx];
5559371c9d4SSatish Balay     s2  = b[1 + idx];
5569371c9d4SSatish Balay     s3  = b[2 + idx];
5579371c9d4SSatish Balay     s4  = b[3 + idx];
55828e30874SSatish Balay     s5  = b[4 + idx];
55928e30874SSatish Balay     while (nz--) {
56028e30874SSatish Balay       idx = 5 * (*vi++);
5619371c9d4SSatish Balay       x1  = t[idx];
5629371c9d4SSatish Balay       x2  = t[1 + idx];
5639371c9d4SSatish Balay       x3  = t[2 + idx];
5649371c9d4SSatish Balay       x4  = t[3 + idx];
5659371c9d4SSatish Balay       x5  = t[4 + idx];
56628e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
56728e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
56828e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
56928e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
57028e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
57128e30874SSatish Balay       v += 25;
57228e30874SSatish Balay     }
57328e30874SSatish Balay     idx        = 5 * i;
5749371c9d4SSatish Balay     t[idx]     = s1;
5759371c9d4SSatish Balay     t[1 + idx] = s2;
5769371c9d4SSatish Balay     t[2 + idx] = s3;
5779371c9d4SSatish Balay     t[3 + idx] = s4;
5789371c9d4SSatish Balay     t[4 + idx] = s5;
57928e30874SSatish Balay   }
58028e30874SSatish Balay   /* backward solve the upper triangular */
58128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
58228e30874SSatish Balay     v   = aa + 25 * diag[i] + 25;
58328e30874SSatish Balay     vi  = aj + diag[i] + 1;
58428e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
58528e30874SSatish Balay     idt = 5 * i;
5869371c9d4SSatish Balay     s1  = t[idt];
5879371c9d4SSatish Balay     s2  = t[1 + idt];
5889371c9d4SSatish Balay     s3  = t[2 + idt];
5899371c9d4SSatish Balay     s4  = t[3 + idt];
5909371c9d4SSatish Balay     s5  = t[4 + idt];
59128e30874SSatish Balay     while (nz--) {
59228e30874SSatish Balay       idx = 5 * (*vi++);
5939371c9d4SSatish Balay       x1  = t[idx];
5949371c9d4SSatish Balay       x2  = t[1 + idx];
5959371c9d4SSatish Balay       x3  = t[2 + idx];
5969371c9d4SSatish Balay       x4  = t[3 + idx];
5979371c9d4SSatish Balay       x5  = t[4 + idx];
59828e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
59928e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
60028e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
60128e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
60228e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
60328e30874SSatish Balay       v += 25;
60428e30874SSatish Balay     }
60528e30874SSatish Balay     idc    = 5 * (*c--);
60628e30874SSatish Balay     v      = aa + 25 * diag[i];
6079371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
6089371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
6099371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
6109371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
6119371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
61228e30874SSatish Balay   }
61328e30874SSatish Balay 
6149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
6159566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
6169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
6179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
6189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
6193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
62028e30874SSatish Balay }
62128e30874SSatish Balay 
622d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
623d71ae5a4SJacob Faibussowitsch {
62428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
62528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
62628e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
62728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
62828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
62928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
63028e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
63128e30874SSatish Balay   const PetscScalar *b;
63228e30874SSatish Balay 
63328e30874SSatish Balay   PetscFunctionBegin;
6349566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
6359566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
63628e30874SSatish Balay   t = a->solve_work;
63728e30874SSatish Balay 
6389371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
6399371c9d4SSatish Balay   r = rout;
6409371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
6419371c9d4SSatish Balay   c = cout;
64228e30874SSatish Balay 
64328e30874SSatish Balay   /* forward solve the lower triangular */
64428e30874SSatish Balay   idx  = 5 * r[0];
6459371c9d4SSatish Balay   t[0] = b[idx];
6469371c9d4SSatish Balay   t[1] = b[1 + idx];
6479371c9d4SSatish Balay   t[2] = b[2 + idx];
6489371c9d4SSatish Balay   t[3] = b[3 + idx];
6499371c9d4SSatish Balay   t[4] = b[4 + idx];
65028e30874SSatish Balay   for (i = 1; i < n; i++) {
65128e30874SSatish Balay     v   = aa + 25 * ai[i];
65228e30874SSatish Balay     vi  = aj + ai[i];
65328e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
65428e30874SSatish Balay     idx = 5 * r[i];
6559371c9d4SSatish Balay     s1  = b[idx];
6569371c9d4SSatish Balay     s2  = b[1 + idx];
6579371c9d4SSatish Balay     s3  = b[2 + idx];
6589371c9d4SSatish Balay     s4  = b[3 + idx];
65928e30874SSatish Balay     s5  = b[4 + idx];
66028e30874SSatish Balay     for (m = 0; m < nz; m++) {
66128e30874SSatish Balay       idx = 5 * vi[m];
6629371c9d4SSatish Balay       x1  = t[idx];
6639371c9d4SSatish Balay       x2  = t[1 + idx];
6649371c9d4SSatish Balay       x3  = t[2 + idx];
6659371c9d4SSatish Balay       x4  = t[3 + idx];
6669371c9d4SSatish Balay       x5  = t[4 + idx];
66728e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
66828e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
66928e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
67028e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
67128e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
67228e30874SSatish Balay       v += 25;
67328e30874SSatish Balay     }
67428e30874SSatish Balay     idx        = 5 * i;
6759371c9d4SSatish Balay     t[idx]     = s1;
6769371c9d4SSatish Balay     t[1 + idx] = s2;
6779371c9d4SSatish Balay     t[2 + idx] = s3;
6789371c9d4SSatish Balay     t[3 + idx] = s4;
6799371c9d4SSatish Balay     t[4 + idx] = s5;
68028e30874SSatish Balay   }
68128e30874SSatish Balay   /* backward solve the upper triangular */
68228e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
68328e30874SSatish Balay     v   = aa + 25 * (adiag[i + 1] + 1);
68428e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
68528e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
68628e30874SSatish Balay     idt = 5 * i;
6879371c9d4SSatish Balay     s1  = t[idt];
6889371c9d4SSatish Balay     s2  = t[1 + idt];
6899371c9d4SSatish Balay     s3  = t[2 + idt];
6909371c9d4SSatish Balay     s4  = t[3 + idt];
6919371c9d4SSatish Balay     s5  = t[4 + idt];
69228e30874SSatish Balay     for (m = 0; m < nz; m++) {
69328e30874SSatish Balay       idx = 5 * vi[m];
6949371c9d4SSatish Balay       x1  = t[idx];
6959371c9d4SSatish Balay       x2  = t[1 + idx];
6969371c9d4SSatish Balay       x3  = t[2 + idx];
6979371c9d4SSatish Balay       x4  = t[3 + idx];
6989371c9d4SSatish Balay       x5  = t[4 + idx];
69928e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
70028e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
70128e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
70228e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
70328e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
70428e30874SSatish Balay       v += 25;
70528e30874SSatish Balay     }
70628e30874SSatish Balay     idc    = 5 * c[i];
7079371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
7089371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
7099371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
7109371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
7119371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
71228e30874SSatish Balay   }
71328e30874SSatish Balay 
7149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
7159566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
7169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
7179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
7189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
7193ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
72028e30874SSatish Balay }
72128e30874SSatish Balay 
722d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
723d71ae5a4SJacob Faibussowitsch {
72428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
72528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
72628e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
72728e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
72828e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
72928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
73028e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
73128e30874SSatish Balay   const PetscScalar *b;
73228e30874SSatish Balay 
73328e30874SSatish Balay   PetscFunctionBegin;
7349566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7359566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
73628e30874SSatish Balay   t = a->solve_work;
73728e30874SSatish Balay 
7389371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7399371c9d4SSatish Balay   r = rout;
7409371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7419371c9d4SSatish Balay   c = cout + (n - 1);
74228e30874SSatish Balay 
74328e30874SSatish Balay   /* forward solve the lower triangular */
74428e30874SSatish Balay   idx  = 4 * (*r++);
7459371c9d4SSatish Balay   t[0] = b[idx];
7469371c9d4SSatish Balay   t[1] = b[1 + idx];
7479371c9d4SSatish Balay   t[2] = b[2 + idx];
7489371c9d4SSatish Balay   t[3] = b[3 + idx];
74928e30874SSatish Balay   for (i = 1; i < n; i++) {
75028e30874SSatish Balay     v   = aa + 16 * ai[i];
75128e30874SSatish Balay     vi  = aj + ai[i];
75228e30874SSatish Balay     nz  = diag[i] - ai[i];
75328e30874SSatish Balay     idx = 4 * (*r++);
7549371c9d4SSatish Balay     s1  = b[idx];
7559371c9d4SSatish Balay     s2  = b[1 + idx];
7569371c9d4SSatish Balay     s3  = b[2 + idx];
7579371c9d4SSatish Balay     s4  = b[3 + idx];
75828e30874SSatish Balay     while (nz--) {
75928e30874SSatish Balay       idx = 4 * (*vi++);
7609371c9d4SSatish Balay       x1  = t[idx];
7619371c9d4SSatish Balay       x2  = t[1 + idx];
7629371c9d4SSatish Balay       x3  = t[2 + idx];
7639371c9d4SSatish Balay       x4  = t[3 + idx];
76428e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
76528e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
76628e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
76728e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
76828e30874SSatish Balay       v += 16;
76928e30874SSatish Balay     }
77028e30874SSatish Balay     idx        = 4 * i;
7719371c9d4SSatish Balay     t[idx]     = s1;
7729371c9d4SSatish Balay     t[1 + idx] = s2;
7739371c9d4SSatish Balay     t[2 + idx] = s3;
7749371c9d4SSatish Balay     t[3 + idx] = s4;
77528e30874SSatish Balay   }
77628e30874SSatish Balay   /* backward solve the upper triangular */
77728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
77828e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
77928e30874SSatish Balay     vi  = aj + diag[i] + 1;
78028e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
78128e30874SSatish Balay     idt = 4 * i;
7829371c9d4SSatish Balay     s1  = t[idt];
7839371c9d4SSatish Balay     s2  = t[1 + idt];
7849371c9d4SSatish Balay     s3  = t[2 + idt];
7859371c9d4SSatish Balay     s4  = t[3 + idt];
78628e30874SSatish Balay     while (nz--) {
78728e30874SSatish Balay       idx = 4 * (*vi++);
7889371c9d4SSatish Balay       x1  = t[idx];
7899371c9d4SSatish Balay       x2  = t[1 + idx];
7909371c9d4SSatish Balay       x3  = t[2 + idx];
7919371c9d4SSatish Balay       x4  = t[3 + idx];
79228e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
79328e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
79428e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
79528e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
79628e30874SSatish Balay       v += 16;
79728e30874SSatish Balay     }
79828e30874SSatish Balay     idc    = 4 * (*c--);
79928e30874SSatish Balay     v      = aa + 16 * diag[i];
80028e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
80128e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
80228e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
80328e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
80428e30874SSatish Balay   }
80528e30874SSatish Balay 
8069566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
8079566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
8089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
8099566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
8109566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
8113ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
81228e30874SSatish Balay }
81328e30874SSatish Balay 
814d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
815d71ae5a4SJacob Faibussowitsch {
81628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
81728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
81828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
81928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
82028e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
82128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
82228e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
82328e30874SSatish Balay   const PetscScalar *b;
82428e30874SSatish Balay 
82528e30874SSatish Balay   PetscFunctionBegin;
8269566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
8279566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
82828e30874SSatish Balay   t = a->solve_work;
82928e30874SSatish Balay 
8309371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
8319371c9d4SSatish Balay   r = rout;
8329371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
8339371c9d4SSatish Balay   c = cout;
83428e30874SSatish Balay 
83528e30874SSatish Balay   /* forward solve the lower triangular */
83628e30874SSatish Balay   idx  = 4 * r[0];
8379371c9d4SSatish Balay   t[0] = b[idx];
8389371c9d4SSatish Balay   t[1] = b[1 + idx];
8399371c9d4SSatish Balay   t[2] = b[2 + idx];
8409371c9d4SSatish Balay   t[3] = b[3 + idx];
84128e30874SSatish Balay   for (i = 1; i < n; i++) {
84228e30874SSatish Balay     v   = aa + 16 * ai[i];
84328e30874SSatish Balay     vi  = aj + ai[i];
84428e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
84528e30874SSatish Balay     idx = 4 * r[i];
8469371c9d4SSatish Balay     s1  = b[idx];
8479371c9d4SSatish Balay     s2  = b[1 + idx];
8489371c9d4SSatish Balay     s3  = b[2 + idx];
8499371c9d4SSatish Balay     s4  = b[3 + idx];
85028e30874SSatish Balay     for (m = 0; m < nz; m++) {
85128e30874SSatish Balay       idx = 4 * vi[m];
8529371c9d4SSatish Balay       x1  = t[idx];
8539371c9d4SSatish Balay       x2  = t[1 + idx];
8549371c9d4SSatish Balay       x3  = t[2 + idx];
8559371c9d4SSatish Balay       x4  = t[3 + idx];
85628e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
85728e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
85828e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
85928e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
86028e30874SSatish Balay       v += 16;
86128e30874SSatish Balay     }
86228e30874SSatish Balay     idx        = 4 * i;
8639371c9d4SSatish Balay     t[idx]     = s1;
8649371c9d4SSatish Balay     t[1 + idx] = s2;
8659371c9d4SSatish Balay     t[2 + idx] = s3;
8669371c9d4SSatish Balay     t[3 + idx] = s4;
86728e30874SSatish Balay   }
86828e30874SSatish Balay   /* backward solve the upper triangular */
86928e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
87028e30874SSatish Balay     v   = aa + 16 * (adiag[i + 1] + 1);
87128e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
87228e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
87328e30874SSatish Balay     idt = 4 * i;
8749371c9d4SSatish Balay     s1  = t[idt];
8759371c9d4SSatish Balay     s2  = t[1 + idt];
8769371c9d4SSatish Balay     s3  = t[2 + idt];
8779371c9d4SSatish Balay     s4  = t[3 + idt];
87828e30874SSatish Balay     for (m = 0; m < nz; m++) {
87928e30874SSatish Balay       idx = 4 * vi[m];
8809371c9d4SSatish Balay       x1  = t[idx];
8819371c9d4SSatish Balay       x2  = t[1 + idx];
8829371c9d4SSatish Balay       x3  = t[2 + idx];
8839371c9d4SSatish Balay       x4  = t[3 + idx];
88428e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
88528e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
88628e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
88728e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
88828e30874SSatish Balay       v += 16;
88928e30874SSatish Balay     }
89028e30874SSatish Balay     idc    = 4 * c[i];
89128e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
89228e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
89328e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
89428e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
89528e30874SSatish Balay   }
89628e30874SSatish Balay 
8979566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
8989566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
8999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
9009566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
9019566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
9023ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
90328e30874SSatish Balay }
90428e30874SSatish Balay 
905d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
906d71ae5a4SJacob Faibussowitsch {
90728e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
90828e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
90928e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
91028e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
91128e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
91228e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
91328e30874SSatish Balay   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
91428e30874SSatish Balay   PetscScalar       *x;
91528e30874SSatish Balay   const PetscScalar *b;
91628e30874SSatish Balay 
91728e30874SSatish Balay   PetscFunctionBegin;
9189566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
9199566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
92028e30874SSatish Balay   t = (MatScalar *)a->solve_work;
92128e30874SSatish Balay 
9229371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
9239371c9d4SSatish Balay   r = rout;
9249371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
9259371c9d4SSatish Balay   c = cout + (n - 1);
92628e30874SSatish Balay 
92728e30874SSatish Balay   /* forward solve the lower triangular */
92828e30874SSatish Balay   idx  = 4 * (*r++);
92928e30874SSatish Balay   t[0] = (MatScalar)b[idx];
93028e30874SSatish Balay   t[1] = (MatScalar)b[1 + idx];
93128e30874SSatish Balay   t[2] = (MatScalar)b[2 + idx];
93228e30874SSatish Balay   t[3] = (MatScalar)b[3 + idx];
93328e30874SSatish Balay   for (i = 1; i < n; i++) {
93428e30874SSatish Balay     v   = aa + 16 * ai[i];
93528e30874SSatish Balay     vi  = aj + ai[i];
93628e30874SSatish Balay     nz  = diag[i] - ai[i];
93728e30874SSatish Balay     idx = 4 * (*r++);
93828e30874SSatish Balay     s1  = (MatScalar)b[idx];
93928e30874SSatish Balay     s2  = (MatScalar)b[1 + idx];
94028e30874SSatish Balay     s3  = (MatScalar)b[2 + idx];
94128e30874SSatish Balay     s4  = (MatScalar)b[3 + idx];
94228e30874SSatish Balay     while (nz--) {
94328e30874SSatish Balay       idx = 4 * (*vi++);
94428e30874SSatish Balay       x1  = t[idx];
94528e30874SSatish Balay       x2  = t[1 + idx];
94628e30874SSatish Balay       x3  = t[2 + idx];
94728e30874SSatish Balay       x4  = t[3 + idx];
94828e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
94928e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
95028e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
95128e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
95228e30874SSatish Balay       v += 16;
95328e30874SSatish Balay     }
95428e30874SSatish Balay     idx        = 4 * i;
95528e30874SSatish Balay     t[idx]     = s1;
95628e30874SSatish Balay     t[1 + idx] = s2;
95728e30874SSatish Balay     t[2 + idx] = s3;
95828e30874SSatish Balay     t[3 + idx] = s4;
95928e30874SSatish Balay   }
96028e30874SSatish Balay   /* backward solve the upper triangular */
96128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
96228e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
96328e30874SSatish Balay     vi  = aj + diag[i] + 1;
96428e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
96528e30874SSatish Balay     idt = 4 * i;
96628e30874SSatish Balay     s1  = t[idt];
96728e30874SSatish Balay     s2  = t[1 + idt];
96828e30874SSatish Balay     s3  = t[2 + idt];
96928e30874SSatish Balay     s4  = t[3 + idt];
97028e30874SSatish Balay     while (nz--) {
97128e30874SSatish Balay       idx = 4 * (*vi++);
97228e30874SSatish Balay       x1  = t[idx];
97328e30874SSatish Balay       x2  = t[1 + idx];
97428e30874SSatish Balay       x3  = t[2 + idx];
97528e30874SSatish Balay       x4  = t[3 + idx];
97628e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
97728e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
97828e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
97928e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
98028e30874SSatish Balay       v += 16;
98128e30874SSatish Balay     }
98228e30874SSatish Balay     idc        = 4 * (*c--);
98328e30874SSatish Balay     v          = aa + 16 * diag[i];
98428e30874SSatish Balay     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
98528e30874SSatish Balay     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
98628e30874SSatish Balay     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
98728e30874SSatish Balay     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
98828e30874SSatish Balay     x[idc]     = (PetscScalar)t[idt];
98928e30874SSatish Balay     x[1 + idc] = (PetscScalar)t[1 + idt];
99028e30874SSatish Balay     x[2 + idc] = (PetscScalar)t[2 + idt];
99128e30874SSatish Balay     x[3 + idc] = (PetscScalar)t[3 + idt];
99228e30874SSatish Balay   }
99328e30874SSatish Balay 
9949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
9959566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
9969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
9979566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
9989566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
9993ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
100028e30874SSatish Balay }
100128e30874SSatish Balay 
100228e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
100328e30874SSatish Balay 
100428e30874SSatish Balay   #include PETSC_HAVE_SSE
100528e30874SSatish Balay 
1006d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1007d71ae5a4SJacob Faibussowitsch {
100828e30874SSatish Balay   /*
100928e30874SSatish Balay      Note: This code uses demotion of double
101028e30874SSatish Balay      to float when performing the mixed-mode computation.
101128e30874SSatish Balay      This may not be numerically reasonable for all applications.
101228e30874SSatish Balay   */
101328e30874SSatish Balay   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
101428e30874SSatish Balay   IS              iscol = a->col, isrow = a->row;
101528e30874SSatish Balay   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
101628e30874SSatish Balay   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
101728e30874SSatish Balay   MatScalar      *aa = a->a, *v;
101828e30874SSatish Balay   PetscScalar    *x, *b, *t;
101928e30874SSatish Balay 
102028e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
102128e30874SSatish Balay   float         ssealignedspace[11], *tmps, *tmpx;
102228e30874SSatish Balay   unsigned long offset;
102328e30874SSatish Balay 
102428e30874SSatish Balay   PetscFunctionBegin;
102528e30874SSatish Balay   SSE_SCOPE_BEGIN;
102628e30874SSatish Balay 
102728e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
102828e30874SSatish Balay   if (offset) offset = (16 - offset) / 4;
102928e30874SSatish Balay   tmps = &ssealignedspace[offset];
103028e30874SSatish Balay   tmpx = &ssealignedspace[offset + 4];
103128e30874SSatish Balay   PREFETCH_NTA(aa + 16 * ai[1]);
103228e30874SSatish Balay 
10339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
10349566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
103528e30874SSatish Balay   t = a->solve_work;
103628e30874SSatish Balay 
10379371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
10389371c9d4SSatish Balay   r = rout;
10399371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
10409371c9d4SSatish Balay   c = cout + (n - 1);
104128e30874SSatish Balay 
104228e30874SSatish Balay   /* forward solve the lower triangular */
104328e30874SSatish Balay   idx  = 4 * (*r++);
10449371c9d4SSatish Balay   t[0] = b[idx];
10459371c9d4SSatish Balay   t[1] = b[1 + idx];
10469371c9d4SSatish Balay   t[2] = b[2 + idx];
10479371c9d4SSatish Balay   t[3] = b[3 + idx];
104828e30874SSatish Balay   v    = aa + 16 * ai[1];
104928e30874SSatish Balay 
105028e30874SSatish Balay   for (i = 1; i < n;) {
105128e30874SSatish Balay     PREFETCH_NTA(&v[8]);
105228e30874SSatish Balay     vi  = aj + ai[i];
105328e30874SSatish Balay     nz  = diag[i] - ai[i];
105428e30874SSatish Balay     idx = 4 * (*r++);
105528e30874SSatish Balay 
105628e30874SSatish Balay     /* Demote sum from double to float */
105728e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
105828e30874SSatish Balay     LOAD_PS(tmps, XMM7);
105928e30874SSatish Balay 
106028e30874SSatish Balay     while (nz--) {
106128e30874SSatish Balay       PREFETCH_NTA(&v[16]);
106228e30874SSatish Balay       idx = 4 * (*vi++);
106328e30874SSatish Balay 
106428e30874SSatish Balay       /* Demote solution (so far) from double to float */
106528e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
106628e30874SSatish Balay 
106728e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
106828e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
106928e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
107028e30874SSatish Balay 
107128e30874SSatish Balay       /* First Column */
107228e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
107328e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
107428e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
107528e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
107628e30874SSatish Balay 
107728e30874SSatish Balay       /* Second Column */
107828e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
107928e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
108028e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
108128e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
108228e30874SSatish Balay 
108328e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
108428e30874SSatish Balay 
108528e30874SSatish Balay       /* Third Column */
108628e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
108728e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
108828e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
108928e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
109028e30874SSatish Balay 
109128e30874SSatish Balay       /* Fourth Column */
109228e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
109328e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
109428e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
109528e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
109628e30874SSatish Balay       SSE_INLINE_END_2
109728e30874SSatish Balay 
109828e30874SSatish Balay       v += 16;
109928e30874SSatish Balay     }
110028e30874SSatish Balay     idx = 4 * i;
110128e30874SSatish Balay     v   = aa + 16 * ai[++i];
110228e30874SSatish Balay     PREFETCH_NTA(v);
110328e30874SSatish Balay     STORE_PS(tmps, XMM7);
110428e30874SSatish Balay 
110528e30874SSatish Balay     /* Promote result from float to double */
110628e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
110728e30874SSatish Balay   }
110828e30874SSatish Balay   /* backward solve the upper triangular */
110928e30874SSatish Balay   idt  = 4 * (n - 1);
111028e30874SSatish Balay   ai16 = 16 * diag[n - 1];
111128e30874SSatish Balay   v    = aa + ai16 + 16;
111228e30874SSatish Balay   for (i = n - 1; i >= 0;) {
111328e30874SSatish Balay     PREFETCH_NTA(&v[8]);
111428e30874SSatish Balay     vi = aj + diag[i] + 1;
111528e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
111628e30874SSatish Balay 
111728e30874SSatish Balay     /* Demote accumulator from double to float */
111828e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
111928e30874SSatish Balay     LOAD_PS(tmps, XMM7);
112028e30874SSatish Balay 
112128e30874SSatish Balay     while (nz--) {
112228e30874SSatish Balay       PREFETCH_NTA(&v[16]);
112328e30874SSatish Balay       idx = 4 * (*vi++);
112428e30874SSatish Balay 
112528e30874SSatish Balay       /* Demote solution (so far) from double to float */
112628e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
112728e30874SSatish Balay 
112828e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
112928e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
113028e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
113128e30874SSatish Balay 
113228e30874SSatish Balay       /* First Column */
113328e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
113428e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
113528e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
113628e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
113728e30874SSatish Balay 
113828e30874SSatish Balay       /* Second Column */
113928e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
114028e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
114128e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
114228e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
114328e30874SSatish Balay 
114428e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
114528e30874SSatish Balay 
114628e30874SSatish Balay       /* Third Column */
114728e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
114828e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
114928e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
115028e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
115128e30874SSatish Balay 
115228e30874SSatish Balay       /* Fourth Column */
115328e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
115428e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
115528e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
115628e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
115728e30874SSatish Balay       SSE_INLINE_END_2
115828e30874SSatish Balay       v += 16;
115928e30874SSatish Balay     }
116028e30874SSatish Balay     v    = aa + ai16;
116128e30874SSatish Balay     ai16 = 16 * diag[--i];
116228e30874SSatish Balay     PREFETCH_NTA(aa + ai16 + 16);
116328e30874SSatish Balay     /*
116428e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
116528e30874SSatish Balay        which was inverted as part of the factorization
116628e30874SSatish Balay     */
116728e30874SSatish Balay     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
116828e30874SSatish Balay     /* First Column */
116928e30874SSatish Balay     SSE_COPY_PS(XMM0, XMM7)
117028e30874SSatish Balay     SSE_SHUFFLE(XMM0, XMM0, 0x00)
117128e30874SSatish Balay     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
117228e30874SSatish Balay 
117328e30874SSatish Balay     /* Second Column */
117428e30874SSatish Balay     SSE_COPY_PS(XMM1, XMM7)
117528e30874SSatish Balay     SSE_SHUFFLE(XMM1, XMM1, 0x55)
117628e30874SSatish Balay     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
117728e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM1)
117828e30874SSatish Balay 
117928e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
118028e30874SSatish Balay 
118128e30874SSatish Balay     /* Third Column */
118228e30874SSatish Balay     SSE_COPY_PS(XMM2, XMM7)
118328e30874SSatish Balay     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
118428e30874SSatish Balay     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
118528e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM2)
118628e30874SSatish Balay 
118728e30874SSatish Balay     /* Fourth Column */
118828e30874SSatish Balay     SSE_COPY_PS(XMM3, XMM7)
118928e30874SSatish Balay     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
119028e30874SSatish Balay     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
119128e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM3)
119228e30874SSatish Balay 
119328e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
119428e30874SSatish Balay     SSE_INLINE_END_3
119528e30874SSatish Balay 
119628e30874SSatish Balay     /* Promote solution from float to double */
119728e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
119828e30874SSatish Balay 
119928e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
120028e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
120128e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
120228e30874SSatish Balay     idc = 4 * (*c--);
120328e30874SSatish Balay     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
120428e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
120528e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
120628e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
120728e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
120828e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
120928e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
121028e30874SSatish Balay     SSE_INLINE_END_2
121128e30874SSatish Balay     v = aa + ai16 + 16;
121228e30874SSatish Balay     idt -= 4;
121328e30874SSatish Balay   }
121428e30874SSatish Balay 
12159566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
12169566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
12179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
12189566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
12199566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
122028e30874SSatish Balay   SSE_SCOPE_END;
12213ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
122228e30874SSatish Balay }
122328e30874SSatish Balay 
122428e30874SSatish Balay #endif
122528e30874SSatish Balay 
1226d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1227d71ae5a4SJacob Faibussowitsch {
122828e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
122928e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
123028e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
123128e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
123228e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
123328e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
123428e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
123528e30874SSatish Balay   const PetscScalar *b;
123628e30874SSatish Balay 
123728e30874SSatish Balay   PetscFunctionBegin;
12389566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
12399566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
124028e30874SSatish Balay   t = a->solve_work;
124128e30874SSatish Balay 
12429371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
12439371c9d4SSatish Balay   r = rout;
12449371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
12459371c9d4SSatish Balay   c = cout + (n - 1);
124628e30874SSatish Balay 
124728e30874SSatish Balay   /* forward solve the lower triangular */
124828e30874SSatish Balay   idx  = 3 * (*r++);
12499371c9d4SSatish Balay   t[0] = b[idx];
12509371c9d4SSatish Balay   t[1] = b[1 + idx];
12519371c9d4SSatish Balay   t[2] = b[2 + idx];
125228e30874SSatish Balay   for (i = 1; i < n; i++) {
125328e30874SSatish Balay     v   = aa + 9 * ai[i];
125428e30874SSatish Balay     vi  = aj + ai[i];
125528e30874SSatish Balay     nz  = diag[i] - ai[i];
125628e30874SSatish Balay     idx = 3 * (*r++);
12579371c9d4SSatish Balay     s1  = b[idx];
12589371c9d4SSatish Balay     s2  = b[1 + idx];
12599371c9d4SSatish Balay     s3  = b[2 + idx];
126028e30874SSatish Balay     while (nz--) {
126128e30874SSatish Balay       idx = 3 * (*vi++);
12629371c9d4SSatish Balay       x1  = t[idx];
12639371c9d4SSatish Balay       x2  = t[1 + idx];
12649371c9d4SSatish Balay       x3  = t[2 + idx];
126528e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
126628e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
126728e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
126828e30874SSatish Balay       v += 9;
126928e30874SSatish Balay     }
127028e30874SSatish Balay     idx        = 3 * i;
12719371c9d4SSatish Balay     t[idx]     = s1;
12729371c9d4SSatish Balay     t[1 + idx] = s2;
12739371c9d4SSatish Balay     t[2 + idx] = s3;
127428e30874SSatish Balay   }
127528e30874SSatish Balay   /* backward solve the upper triangular */
127628e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
127728e30874SSatish Balay     v   = aa + 9 * diag[i] + 9;
127828e30874SSatish Balay     vi  = aj + diag[i] + 1;
127928e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
128028e30874SSatish Balay     idt = 3 * i;
12819371c9d4SSatish Balay     s1  = t[idt];
12829371c9d4SSatish Balay     s2  = t[1 + idt];
12839371c9d4SSatish Balay     s3  = t[2 + idt];
128428e30874SSatish Balay     while (nz--) {
128528e30874SSatish Balay       idx = 3 * (*vi++);
12869371c9d4SSatish Balay       x1  = t[idx];
12879371c9d4SSatish Balay       x2  = t[1 + idx];
12889371c9d4SSatish Balay       x3  = t[2 + idx];
128928e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
129028e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
129128e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
129228e30874SSatish Balay       v += 9;
129328e30874SSatish Balay     }
129428e30874SSatish Balay     idc    = 3 * (*c--);
129528e30874SSatish Balay     v      = aa + 9 * diag[i];
129628e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
129728e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
129828e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
129928e30874SSatish Balay   }
13009566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
13019566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
13029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
13039566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
13049566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
13053ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
130628e30874SSatish Balay }
130728e30874SSatish Balay 
1308d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1309d71ae5a4SJacob Faibussowitsch {
131028e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
131128e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
131228e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
131328e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
131428e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
131528e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
131628e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
131728e30874SSatish Balay   const PetscScalar *b;
131828e30874SSatish Balay 
131928e30874SSatish Balay   PetscFunctionBegin;
13209566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
13219566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
132228e30874SSatish Balay   t = a->solve_work;
132328e30874SSatish Balay 
13249371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
13259371c9d4SSatish Balay   r = rout;
13269371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
13279371c9d4SSatish Balay   c = cout;
132828e30874SSatish Balay 
132928e30874SSatish Balay   /* forward solve the lower triangular */
133028e30874SSatish Balay   idx  = 3 * r[0];
13319371c9d4SSatish Balay   t[0] = b[idx];
13329371c9d4SSatish Balay   t[1] = b[1 + idx];
13339371c9d4SSatish Balay   t[2] = b[2 + idx];
133428e30874SSatish Balay   for (i = 1; i < n; i++) {
133528e30874SSatish Balay     v   = aa + 9 * ai[i];
133628e30874SSatish Balay     vi  = aj + ai[i];
133728e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
133828e30874SSatish Balay     idx = 3 * r[i];
13399371c9d4SSatish Balay     s1  = b[idx];
13409371c9d4SSatish Balay     s2  = b[1 + idx];
13419371c9d4SSatish Balay     s3  = b[2 + idx];
134228e30874SSatish Balay     for (m = 0; m < nz; m++) {
134328e30874SSatish Balay       idx = 3 * vi[m];
13449371c9d4SSatish Balay       x1  = t[idx];
13459371c9d4SSatish Balay       x2  = t[1 + idx];
13469371c9d4SSatish Balay       x3  = t[2 + idx];
134728e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
134828e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
134928e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
135028e30874SSatish Balay       v += 9;
135128e30874SSatish Balay     }
135228e30874SSatish Balay     idx        = 3 * i;
13539371c9d4SSatish Balay     t[idx]     = s1;
13549371c9d4SSatish Balay     t[1 + idx] = s2;
13559371c9d4SSatish Balay     t[2 + idx] = s3;
135628e30874SSatish Balay   }
135728e30874SSatish Balay   /* backward solve the upper triangular */
135828e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
135928e30874SSatish Balay     v   = aa + 9 * (adiag[i + 1] + 1);
136028e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
136128e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
136228e30874SSatish Balay     idt = 3 * i;
13639371c9d4SSatish Balay     s1  = t[idt];
13649371c9d4SSatish Balay     s2  = t[1 + idt];
13659371c9d4SSatish Balay     s3  = t[2 + idt];
136628e30874SSatish Balay     for (m = 0; m < nz; m++) {
136728e30874SSatish Balay       idx = 3 * vi[m];
13689371c9d4SSatish Balay       x1  = t[idx];
13699371c9d4SSatish Balay       x2  = t[1 + idx];
13709371c9d4SSatish Balay       x3  = t[2 + idx];
137128e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
137228e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
137328e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
137428e30874SSatish Balay       v += 9;
137528e30874SSatish Balay     }
137628e30874SSatish Balay     idc    = 3 * c[i];
137728e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
137828e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
137928e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
138028e30874SSatish Balay   }
13819566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
13829566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
13839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
13849566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
13859566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
13863ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138728e30874SSatish Balay }
138828e30874SSatish Balay 
1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1390d71ae5a4SJacob Faibussowitsch {
139128e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
139228e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
139328e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
139428e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
139528e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
139628e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
139728e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
139828e30874SSatish Balay   const PetscScalar *b;
139928e30874SSatish Balay 
140028e30874SSatish Balay   PetscFunctionBegin;
14019566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
14029566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
140328e30874SSatish Balay   t = a->solve_work;
140428e30874SSatish Balay 
14059371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
14069371c9d4SSatish Balay   r = rout;
14079371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
14089371c9d4SSatish Balay   c = cout + (n - 1);
140928e30874SSatish Balay 
141028e30874SSatish Balay   /* forward solve the lower triangular */
141128e30874SSatish Balay   idx  = 2 * (*r++);
14129371c9d4SSatish Balay   t[0] = b[idx];
14139371c9d4SSatish Balay   t[1] = b[1 + idx];
141428e30874SSatish Balay   for (i = 1; i < n; i++) {
141528e30874SSatish Balay     v   = aa + 4 * ai[i];
141628e30874SSatish Balay     vi  = aj + ai[i];
141728e30874SSatish Balay     nz  = diag[i] - ai[i];
141828e30874SSatish Balay     idx = 2 * (*r++);
14199371c9d4SSatish Balay     s1  = b[idx];
14209371c9d4SSatish Balay     s2  = b[1 + idx];
142128e30874SSatish Balay     while (nz--) {
142228e30874SSatish Balay       idx = 2 * (*vi++);
14239371c9d4SSatish Balay       x1  = t[idx];
14249371c9d4SSatish Balay       x2  = t[1 + idx];
142528e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
142628e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
142728e30874SSatish Balay       v += 4;
142828e30874SSatish Balay     }
142928e30874SSatish Balay     idx        = 2 * i;
14309371c9d4SSatish Balay     t[idx]     = s1;
14319371c9d4SSatish Balay     t[1 + idx] = s2;
143228e30874SSatish Balay   }
143328e30874SSatish Balay   /* backward solve the upper triangular */
143428e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
143528e30874SSatish Balay     v   = aa + 4 * diag[i] + 4;
143628e30874SSatish Balay     vi  = aj + diag[i] + 1;
143728e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
143828e30874SSatish Balay     idt = 2 * i;
14399371c9d4SSatish Balay     s1  = t[idt];
14409371c9d4SSatish Balay     s2  = t[1 + idt];
144128e30874SSatish Balay     while (nz--) {
144228e30874SSatish Balay       idx = 2 * (*vi++);
14439371c9d4SSatish Balay       x1  = t[idx];
14449371c9d4SSatish Balay       x2  = t[1 + idx];
144528e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
144628e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
144728e30874SSatish Balay       v += 4;
144828e30874SSatish Balay     }
144928e30874SSatish Balay     idc    = 2 * (*c--);
145028e30874SSatish Balay     v      = aa + 4 * diag[i];
145128e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
145228e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
145328e30874SSatish Balay   }
14549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
14559566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
14569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
14579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
14589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
14593ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
146028e30874SSatish Balay }
146128e30874SSatish Balay 
1462d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1463d71ae5a4SJacob Faibussowitsch {
146428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
146528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
146628e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
146728e30874SSatish Balay   PetscInt           i, nz, idx, jdx, idt, idc, m;
146828e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
146928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
147028e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
147128e30874SSatish Balay   const PetscScalar *b;
147228e30874SSatish Balay 
147328e30874SSatish Balay   PetscFunctionBegin;
14749566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
14759566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
147628e30874SSatish Balay   t = a->solve_work;
147728e30874SSatish Balay 
14789371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
14799371c9d4SSatish Balay   r = rout;
14809371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
14819371c9d4SSatish Balay   c = cout;
148228e30874SSatish Balay 
148328e30874SSatish Balay   /* forward solve the lower triangular */
148428e30874SSatish Balay   idx  = 2 * r[0];
14859371c9d4SSatish Balay   t[0] = b[idx];
14869371c9d4SSatish Balay   t[1] = b[1 + idx];
148728e30874SSatish Balay   for (i = 1; i < n; i++) {
148828e30874SSatish Balay     v   = aa + 4 * ai[i];
148928e30874SSatish Balay     vi  = aj + ai[i];
149028e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
149128e30874SSatish Balay     idx = 2 * r[i];
14929371c9d4SSatish Balay     s1  = b[idx];
14939371c9d4SSatish Balay     s2  = b[1 + idx];
149428e30874SSatish Balay     for (m = 0; m < nz; m++) {
149528e30874SSatish Balay       jdx = 2 * vi[m];
14969371c9d4SSatish Balay       x1  = t[jdx];
14979371c9d4SSatish Balay       x2  = t[1 + jdx];
149828e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
149928e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
150028e30874SSatish Balay       v += 4;
150128e30874SSatish Balay     }
150228e30874SSatish Balay     idx        = 2 * i;
15039371c9d4SSatish Balay     t[idx]     = s1;
15049371c9d4SSatish Balay     t[1 + idx] = s2;
150528e30874SSatish Balay   }
150628e30874SSatish Balay   /* backward solve the upper triangular */
150728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
150828e30874SSatish Balay     v   = aa + 4 * (adiag[i + 1] + 1);
150928e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
151028e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
151128e30874SSatish Balay     idt = 2 * i;
15129371c9d4SSatish Balay     s1  = t[idt];
15139371c9d4SSatish Balay     s2  = t[1 + idt];
151428e30874SSatish Balay     for (m = 0; m < nz; m++) {
151528e30874SSatish Balay       idx = 2 * vi[m];
15169371c9d4SSatish Balay       x1  = t[idx];
15179371c9d4SSatish Balay       x2  = t[1 + idx];
151828e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
151928e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
152028e30874SSatish Balay       v += 4;
152128e30874SSatish Balay     }
152228e30874SSatish Balay     idc    = 2 * c[i];
152328e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
152428e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
152528e30874SSatish Balay   }
15269566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15279566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15299566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15309566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
15313ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
153228e30874SSatish Balay }
153328e30874SSatish Balay 
1534d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1535d71ae5a4SJacob Faibussowitsch {
153628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
153728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
153828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
153928e30874SSatish Balay   PetscInt           i, nz;
154028e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
154128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
154228e30874SSatish Balay   PetscScalar       *x, s1, *t;
154328e30874SSatish Balay   const PetscScalar *b;
154428e30874SSatish Balay 
154528e30874SSatish Balay   PetscFunctionBegin;
15463ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
154728e30874SSatish Balay 
15489566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15499566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
155028e30874SSatish Balay   t = a->solve_work;
155128e30874SSatish Balay 
15529371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
15539371c9d4SSatish Balay   r = rout;
15549371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
15559371c9d4SSatish Balay   c = cout + (n - 1);
155628e30874SSatish Balay 
155728e30874SSatish Balay   /* forward solve the lower triangular */
155828e30874SSatish Balay   t[0] = b[*r++];
155928e30874SSatish Balay   for (i = 1; i < n; i++) {
156028e30874SSatish Balay     v  = aa + ai[i];
156128e30874SSatish Balay     vi = aj + ai[i];
156228e30874SSatish Balay     nz = diag[i] - ai[i];
156328e30874SSatish Balay     s1 = b[*r++];
1564ad540459SPierre Jolivet     while (nz--) s1 -= (*v++) * t[*vi++];
156528e30874SSatish Balay     t[i] = s1;
156628e30874SSatish Balay   }
156728e30874SSatish Balay   /* backward solve the upper triangular */
156828e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
156928e30874SSatish Balay     v  = aa + diag[i] + 1;
157028e30874SSatish Balay     vi = aj + diag[i] + 1;
157128e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
157228e30874SSatish Balay     s1 = t[i];
1573ad540459SPierre Jolivet     while (nz--) s1 -= (*v++) * t[*vi++];
157428e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]] * s1;
157528e30874SSatish Balay   }
157628e30874SSatish Balay 
15779566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15789566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15809566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15819566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
15823ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
158328e30874SSatish Balay }
158428e30874SSatish Balay 
1585d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1586d71ae5a4SJacob Faibussowitsch {
158728e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
158828e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
158928e30874SSatish Balay   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
159028e30874SSatish Balay   const PetscInt    *rout, *cout, *r, *c;
159128e30874SSatish Balay   PetscScalar       *x, *tmp, sum;
159228e30874SSatish Balay   const PetscScalar *b;
159328e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
159428e30874SSatish Balay 
159528e30874SSatish Balay   PetscFunctionBegin;
15963ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
159728e30874SSatish Balay 
15989566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15999566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
160028e30874SSatish Balay   tmp = a->solve_work;
160128e30874SSatish Balay 
16029371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
16039371c9d4SSatish Balay   r = rout;
16049371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
16059371c9d4SSatish Balay   c = cout;
160628e30874SSatish Balay 
160728e30874SSatish Balay   /* forward solve the lower triangular */
160828e30874SSatish Balay   tmp[0] = b[r[0]];
160928e30874SSatish Balay   v      = aa;
161028e30874SSatish Balay   vi     = aj;
161128e30874SSatish Balay   for (i = 1; i < n; i++) {
161228e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
161328e30874SSatish Balay     sum = b[r[i]];
161428e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
161528e30874SSatish Balay     tmp[i] = sum;
16169371c9d4SSatish Balay     v += nz;
16179371c9d4SSatish Balay     vi += nz;
161828e30874SSatish Balay   }
161928e30874SSatish Balay 
162028e30874SSatish Balay   /* backward solve the upper triangular */
162128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
162228e30874SSatish Balay     v   = aa + adiag[i + 1] + 1;
162328e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
162428e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
162528e30874SSatish Balay     sum = tmp[i];
162628e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
162728e30874SSatish Balay     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
162828e30874SSatish Balay   }
162928e30874SSatish Balay 
16309566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
16319566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
16329566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
16339566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
16349566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
16353ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
163628e30874SSatish Balay }
1637