xref: /petsc/src/mat/impls/baij/seq/baijsolv.c (revision 3ba1676111f5c958fe6c2729b46ca4d523958bb3)
128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h>
2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h>
328e30874SSatish Balay 
4d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx)
5d71ae5a4SJacob Faibussowitsch {
628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
828e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
928e30874SSatish Balay   const PetscInt     n = a->mbs, *ai = a->i, *aj = a->j, *vi;
1028e30874SSatish Balay   PetscInt           i, nz;
1128e30874SSatish Balay   const PetscInt     bs = A->rmap->bs, bs2 = a->bs2;
1228e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
1328e30874SSatish Balay   PetscScalar       *x, *s, *t, *ls;
1428e30874SSatish Balay   const PetscScalar *b;
1528e30874SSatish Balay 
1628e30874SSatish Balay   PetscFunctionBegin;
179566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
189566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
1928e30874SSatish Balay   t = a->solve_work;
2028e30874SSatish Balay 
219371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
229371c9d4SSatish Balay   r = rout;
239371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
249371c9d4SSatish Balay   c = cout + (n - 1);
2528e30874SSatish Balay 
2628e30874SSatish Balay   /* forward solve the lower triangular */
279566063dSJacob Faibussowitsch   PetscCall(PetscArraycpy(t, b + bs * (*r++), bs));
2828e30874SSatish Balay   for (i = 1; i < n; i++) {
2928e30874SSatish Balay     v  = aa + bs2 * ai[i];
3028e30874SSatish Balay     vi = aj + ai[i];
3128e30874SSatish Balay     nz = a->diag[i] - ai[i];
3228e30874SSatish Balay     s  = t + bs * i;
339566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(s, b + bs * (*r++), bs));
3428e30874SSatish Balay     while (nz--) {
3528e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++));
3628e30874SSatish Balay       v += bs2;
3728e30874SSatish Balay     }
3828e30874SSatish Balay   }
3928e30874SSatish Balay   /* backward solve the upper triangular */
4028e30874SSatish Balay   ls = a->solve_work + A->cmap->n;
4128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
4228e30874SSatish Balay     v  = aa + bs2 * (a->diag[i] + 1);
4328e30874SSatish Balay     vi = aj + a->diag[i] + 1;
4428e30874SSatish Balay     nz = ai[i + 1] - a->diag[i] - 1;
459566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(ls, t + i * bs, bs));
4628e30874SSatish Balay     while (nz--) {
4728e30874SSatish Balay       PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++));
4828e30874SSatish Balay       v += bs2;
4928e30874SSatish Balay     }
5028e30874SSatish Balay     PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs);
519566063dSJacob Faibussowitsch     PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs));
5228e30874SSatish Balay   }
5328e30874SSatish Balay 
549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
559566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
579566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
589566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n));
59*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
6028e30874SSatish Balay }
6128e30874SSatish Balay 
62d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx)
63d71ae5a4SJacob Faibussowitsch {
6428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
6528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
6628e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j;
6728e30874SSatish Balay   const PetscInt    *rout, *cout, *diag = a->diag, *vi, n = a->mbs;
6828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
6928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
7028e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
7128e30874SSatish Balay   const PetscScalar *b;
7228e30874SSatish Balay 
7328e30874SSatish Balay   PetscFunctionBegin;
749566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
759566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
7628e30874SSatish Balay   t = a->solve_work;
7728e30874SSatish Balay 
789371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
799371c9d4SSatish Balay   r = rout;
809371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
819371c9d4SSatish Balay   c = cout + (n - 1);
8228e30874SSatish Balay 
8328e30874SSatish Balay   /* forward solve the lower triangular */
8428e30874SSatish Balay   idx  = 7 * (*r++);
859371c9d4SSatish Balay   t[0] = b[idx];
869371c9d4SSatish Balay   t[1] = b[1 + idx];
879371c9d4SSatish Balay   t[2] = b[2 + idx];
889371c9d4SSatish Balay   t[3] = b[3 + idx];
899371c9d4SSatish Balay   t[4] = b[4 + idx];
909371c9d4SSatish Balay   t[5] = b[5 + idx];
919371c9d4SSatish Balay   t[6] = b[6 + idx];
9228e30874SSatish Balay 
9328e30874SSatish Balay   for (i = 1; i < n; i++) {
9428e30874SSatish Balay     v   = aa + 49 * ai[i];
9528e30874SSatish Balay     vi  = aj + ai[i];
9628e30874SSatish Balay     nz  = diag[i] - ai[i];
9728e30874SSatish Balay     idx = 7 * (*r++);
989371c9d4SSatish Balay     s1  = b[idx];
999371c9d4SSatish Balay     s2  = b[1 + idx];
1009371c9d4SSatish Balay     s3  = b[2 + idx];
1019371c9d4SSatish Balay     s4  = b[3 + idx];
1029371c9d4SSatish Balay     s5  = b[4 + idx];
1039371c9d4SSatish Balay     s6  = b[5 + idx];
1049371c9d4SSatish Balay     s7  = b[6 + idx];
10528e30874SSatish Balay     while (nz--) {
10628e30874SSatish Balay       idx = 7 * (*vi++);
1079371c9d4SSatish Balay       x1  = t[idx];
1089371c9d4SSatish Balay       x2  = t[1 + idx];
1099371c9d4SSatish Balay       x3  = t[2 + idx];
1109371c9d4SSatish Balay       x4  = t[3 + idx];
1119371c9d4SSatish Balay       x5  = t[4 + idx];
1129371c9d4SSatish Balay       x6  = t[5 + idx];
1139371c9d4SSatish Balay       x7  = t[6 + idx];
11428e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
11528e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
11628e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
11728e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
11828e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
11928e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
12028e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
12128e30874SSatish Balay       v += 49;
12228e30874SSatish Balay     }
12328e30874SSatish Balay     idx        = 7 * i;
1249371c9d4SSatish Balay     t[idx]     = s1;
1259371c9d4SSatish Balay     t[1 + idx] = s2;
1269371c9d4SSatish Balay     t[2 + idx] = s3;
1279371c9d4SSatish Balay     t[3 + idx] = s4;
1289371c9d4SSatish Balay     t[4 + idx] = s5;
1299371c9d4SSatish Balay     t[5 + idx] = s6;
1309371c9d4SSatish Balay     t[6 + idx] = s7;
13128e30874SSatish Balay   }
13228e30874SSatish Balay   /* backward solve the upper triangular */
13328e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
13428e30874SSatish Balay     v   = aa + 49 * diag[i] + 49;
13528e30874SSatish Balay     vi  = aj + diag[i] + 1;
13628e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
13728e30874SSatish Balay     idt = 7 * i;
1389371c9d4SSatish Balay     s1  = t[idt];
1399371c9d4SSatish Balay     s2  = t[1 + idt];
1409371c9d4SSatish Balay     s3  = t[2 + idt];
1419371c9d4SSatish Balay     s4  = t[3 + idt];
1429371c9d4SSatish Balay     s5  = t[4 + idt];
1439371c9d4SSatish Balay     s6  = t[5 + idt];
1449371c9d4SSatish Balay     s7  = t[6 + idt];
14528e30874SSatish Balay     while (nz--) {
14628e30874SSatish Balay       idx = 7 * (*vi++);
1479371c9d4SSatish Balay       x1  = t[idx];
1489371c9d4SSatish Balay       x2  = t[1 + idx];
1499371c9d4SSatish Balay       x3  = t[2 + idx];
1509371c9d4SSatish Balay       x4  = t[3 + idx];
1519371c9d4SSatish Balay       x5  = t[4 + idx];
1529371c9d4SSatish Balay       x6  = t[5 + idx];
1539371c9d4SSatish Balay       x7  = t[6 + idx];
15428e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
15528e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
15628e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
15728e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
15828e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
15928e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
16028e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
16128e30874SSatish Balay       v += 49;
16228e30874SSatish Balay     }
16328e30874SSatish Balay     idc    = 7 * (*c--);
16428e30874SSatish Balay     v      = aa + 49 * diag[i];
1659371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
1669371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
1679371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
1689371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
1699371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
1709371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
1719371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
17228e30874SSatish Balay   }
17328e30874SSatish Balay 
1749566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
1759566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
1769566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
1779566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
1789566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
179*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
18028e30874SSatish Balay }
18128e30874SSatish Balay 
182d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx)
183d71ae5a4SJacob Faibussowitsch {
18428e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
18528e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
18628e30874SSatish Balay   const PetscInt    *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag;
18728e30874SSatish Balay   const PetscInt     n = a->mbs, *rout, *cout, *vi;
18828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
18928e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
19028e30874SSatish Balay   PetscScalar        s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t;
19128e30874SSatish Balay   const PetscScalar *b;
19228e30874SSatish Balay 
19328e30874SSatish Balay   PetscFunctionBegin;
1949566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
1959566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
19628e30874SSatish Balay   t = a->solve_work;
19728e30874SSatish Balay 
1989371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
1999371c9d4SSatish Balay   r = rout;
2009371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
2019371c9d4SSatish Balay   c = cout;
20228e30874SSatish Balay 
20328e30874SSatish Balay   /* forward solve the lower triangular */
20428e30874SSatish Balay   idx  = 7 * r[0];
2059371c9d4SSatish Balay   t[0] = b[idx];
2069371c9d4SSatish Balay   t[1] = b[1 + idx];
2079371c9d4SSatish Balay   t[2] = b[2 + idx];
2089371c9d4SSatish Balay   t[3] = b[3 + idx];
2099371c9d4SSatish Balay   t[4] = b[4 + idx];
2109371c9d4SSatish Balay   t[5] = b[5 + idx];
2119371c9d4SSatish Balay   t[6] = b[6 + idx];
21228e30874SSatish Balay 
21328e30874SSatish Balay   for (i = 1; i < n; i++) {
21428e30874SSatish Balay     v   = aa + 49 * ai[i];
21528e30874SSatish Balay     vi  = aj + ai[i];
21628e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
21728e30874SSatish Balay     idx = 7 * r[i];
2189371c9d4SSatish Balay     s1  = b[idx];
2199371c9d4SSatish Balay     s2  = b[1 + idx];
2209371c9d4SSatish Balay     s3  = b[2 + idx];
2219371c9d4SSatish Balay     s4  = b[3 + idx];
2229371c9d4SSatish Balay     s5  = b[4 + idx];
2239371c9d4SSatish Balay     s6  = b[5 + idx];
2249371c9d4SSatish Balay     s7  = b[6 + idx];
22528e30874SSatish Balay     for (m = 0; m < nz; m++) {
22628e30874SSatish Balay       idx = 7 * vi[m];
2279371c9d4SSatish Balay       x1  = t[idx];
2289371c9d4SSatish Balay       x2  = t[1 + idx];
2299371c9d4SSatish Balay       x3  = t[2 + idx];
2309371c9d4SSatish Balay       x4  = t[3 + idx];
2319371c9d4SSatish Balay       x5  = t[4 + idx];
2329371c9d4SSatish Balay       x6  = t[5 + idx];
2339371c9d4SSatish Balay       x7  = t[6 + idx];
23428e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
23528e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
23628e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
23728e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
23828e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
23928e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
24028e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
24128e30874SSatish Balay       v += 49;
24228e30874SSatish Balay     }
24328e30874SSatish Balay     idx        = 7 * i;
2449371c9d4SSatish Balay     t[idx]     = s1;
2459371c9d4SSatish Balay     t[1 + idx] = s2;
2469371c9d4SSatish Balay     t[2 + idx] = s3;
2479371c9d4SSatish Balay     t[3 + idx] = s4;
2489371c9d4SSatish Balay     t[4 + idx] = s5;
2499371c9d4SSatish Balay     t[5 + idx] = s6;
2509371c9d4SSatish Balay     t[6 + idx] = s7;
25128e30874SSatish Balay   }
25228e30874SSatish Balay   /* backward solve the upper triangular */
25328e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
25428e30874SSatish Balay     v   = aa + 49 * (adiag[i + 1] + 1);
25528e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
25628e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
25728e30874SSatish Balay     idt = 7 * i;
2589371c9d4SSatish Balay     s1  = t[idt];
2599371c9d4SSatish Balay     s2  = t[1 + idt];
2609371c9d4SSatish Balay     s3  = t[2 + idt];
2619371c9d4SSatish Balay     s4  = t[3 + idt];
2629371c9d4SSatish Balay     s5  = t[4 + idt];
2639371c9d4SSatish Balay     s6  = t[5 + idt];
2649371c9d4SSatish Balay     s7  = t[6 + idt];
26528e30874SSatish Balay     for (m = 0; m < nz; m++) {
26628e30874SSatish Balay       idx = 7 * vi[m];
2679371c9d4SSatish Balay       x1  = t[idx];
2689371c9d4SSatish Balay       x2  = t[1 + idx];
2699371c9d4SSatish Balay       x3  = t[2 + idx];
2709371c9d4SSatish Balay       x4  = t[3 + idx];
2719371c9d4SSatish Balay       x5  = t[4 + idx];
2729371c9d4SSatish Balay       x6  = t[5 + idx];
2739371c9d4SSatish Balay       x7  = t[6 + idx];
27428e30874SSatish Balay       s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7;
27528e30874SSatish Balay       s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7;
27628e30874SSatish Balay       s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7;
27728e30874SSatish Balay       s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7;
27828e30874SSatish Balay       s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7;
27928e30874SSatish Balay       s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7;
28028e30874SSatish Balay       s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7;
28128e30874SSatish Balay       v += 49;
28228e30874SSatish Balay     }
28328e30874SSatish Balay     idc    = 7 * c[i];
2849371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7;
2859371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7;
2869371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7;
2879371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7;
2889371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7;
2899371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7;
2909371c9d4SSatish Balay     x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7;
29128e30874SSatish Balay   }
29228e30874SSatish Balay 
2939566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
2949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
2959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
2969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
2979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n));
298*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
29928e30874SSatish Balay }
30028e30874SSatish Balay 
301d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx)
302d71ae5a4SJacob Faibussowitsch {
30328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
30428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
30528e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
30628e30874SSatish Balay   const PetscInt    *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j;
30728e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
30828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
30928e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
31028e30874SSatish Balay   const PetscScalar *b;
31128e30874SSatish Balay 
31228e30874SSatish Balay   PetscFunctionBegin;
3139566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
3149566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
31528e30874SSatish Balay   t = a->solve_work;
31628e30874SSatish Balay 
3179371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
3189371c9d4SSatish Balay   r = rout;
3199371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
3209371c9d4SSatish Balay   c = cout + (n - 1);
32128e30874SSatish Balay 
32228e30874SSatish Balay   /* forward solve the lower triangular */
32328e30874SSatish Balay   idx  = 6 * (*r++);
3249371c9d4SSatish Balay   t[0] = b[idx];
3259371c9d4SSatish Balay   t[1] = b[1 + idx];
3269371c9d4SSatish Balay   t[2] = b[2 + idx];
3279371c9d4SSatish Balay   t[3] = b[3 + idx];
3289371c9d4SSatish Balay   t[4] = b[4 + idx];
3299371c9d4SSatish Balay   t[5] = b[5 + idx];
33028e30874SSatish Balay   for (i = 1; i < n; i++) {
33128e30874SSatish Balay     v   = aa + 36 * ai[i];
33228e30874SSatish Balay     vi  = aj + ai[i];
33328e30874SSatish Balay     nz  = diag[i] - ai[i];
33428e30874SSatish Balay     idx = 6 * (*r++);
3359371c9d4SSatish Balay     s1  = b[idx];
3369371c9d4SSatish Balay     s2  = b[1 + idx];
3379371c9d4SSatish Balay     s3  = b[2 + idx];
3389371c9d4SSatish Balay     s4  = b[3 + idx];
3399371c9d4SSatish Balay     s5  = b[4 + idx];
3409371c9d4SSatish Balay     s6  = b[5 + idx];
34128e30874SSatish Balay     while (nz--) {
34228e30874SSatish Balay       idx = 6 * (*vi++);
3439371c9d4SSatish Balay       x1  = t[idx];
3449371c9d4SSatish Balay       x2  = t[1 + idx];
3459371c9d4SSatish Balay       x3  = t[2 + idx];
3469371c9d4SSatish Balay       x4  = t[3 + idx];
3479371c9d4SSatish Balay       x5  = t[4 + idx];
3489371c9d4SSatish Balay       x6  = t[5 + idx];
34928e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
35028e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
35128e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
35228e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
35328e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
35428e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
35528e30874SSatish Balay       v += 36;
35628e30874SSatish Balay     }
35728e30874SSatish Balay     idx        = 6 * i;
3589371c9d4SSatish Balay     t[idx]     = s1;
3599371c9d4SSatish Balay     t[1 + idx] = s2;
3609371c9d4SSatish Balay     t[2 + idx] = s3;
3619371c9d4SSatish Balay     t[3 + idx] = s4;
3629371c9d4SSatish Balay     t[4 + idx] = s5;
3639371c9d4SSatish Balay     t[5 + idx] = s6;
36428e30874SSatish Balay   }
36528e30874SSatish Balay   /* backward solve the upper triangular */
36628e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
36728e30874SSatish Balay     v   = aa + 36 * diag[i] + 36;
36828e30874SSatish Balay     vi  = aj + diag[i] + 1;
36928e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
37028e30874SSatish Balay     idt = 6 * i;
3719371c9d4SSatish Balay     s1  = t[idt];
3729371c9d4SSatish Balay     s2  = t[1 + idt];
3739371c9d4SSatish Balay     s3  = t[2 + idt];
3749371c9d4SSatish Balay     s4  = t[3 + idt];
3759371c9d4SSatish Balay     s5  = t[4 + idt];
3769371c9d4SSatish Balay     s6  = t[5 + idt];
37728e30874SSatish Balay     while (nz--) {
37828e30874SSatish Balay       idx = 6 * (*vi++);
3799371c9d4SSatish Balay       x1  = t[idx];
3809371c9d4SSatish Balay       x2  = t[1 + idx];
3819371c9d4SSatish Balay       x3  = t[2 + idx];
3829371c9d4SSatish Balay       x4  = t[3 + idx];
3839371c9d4SSatish Balay       x5  = t[4 + idx];
3849371c9d4SSatish Balay       x6  = t[5 + idx];
38528e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
38628e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
38728e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
38828e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
38928e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
39028e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
39128e30874SSatish Balay       v += 36;
39228e30874SSatish Balay     }
39328e30874SSatish Balay     idc    = 6 * (*c--);
39428e30874SSatish Balay     v      = aa + 36 * diag[i];
3959371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
3969371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
3979371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
3989371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
3999371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
4009371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
40128e30874SSatish Balay   }
40228e30874SSatish Balay 
4039566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
4049566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
4059566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
4069566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
4079566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
408*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
40928e30874SSatish Balay }
41028e30874SSatish Balay 
411d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx)
412d71ae5a4SJacob Faibussowitsch {
41328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
41428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
41528e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
41628e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
41728e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
41828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
41928e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t;
42028e30874SSatish Balay   const PetscScalar *b;
42128e30874SSatish Balay 
42228e30874SSatish Balay   PetscFunctionBegin;
4239566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
4249566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
42528e30874SSatish Balay   t = a->solve_work;
42628e30874SSatish Balay 
4279371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
4289371c9d4SSatish Balay   r = rout;
4299371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
4309371c9d4SSatish Balay   c = cout;
43128e30874SSatish Balay 
43228e30874SSatish Balay   /* forward solve the lower triangular */
43328e30874SSatish Balay   idx  = 6 * r[0];
4349371c9d4SSatish Balay   t[0] = b[idx];
4359371c9d4SSatish Balay   t[1] = b[1 + idx];
4369371c9d4SSatish Balay   t[2] = b[2 + idx];
4379371c9d4SSatish Balay   t[3] = b[3 + idx];
4389371c9d4SSatish Balay   t[4] = b[4 + idx];
4399371c9d4SSatish Balay   t[5] = b[5 + idx];
44028e30874SSatish Balay   for (i = 1; i < n; i++) {
44128e30874SSatish Balay     v   = aa + 36 * ai[i];
44228e30874SSatish Balay     vi  = aj + ai[i];
44328e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
44428e30874SSatish Balay     idx = 6 * r[i];
4459371c9d4SSatish Balay     s1  = b[idx];
4469371c9d4SSatish Balay     s2  = b[1 + idx];
4479371c9d4SSatish Balay     s3  = b[2 + idx];
4489371c9d4SSatish Balay     s4  = b[3 + idx];
4499371c9d4SSatish Balay     s5  = b[4 + idx];
4509371c9d4SSatish Balay     s6  = b[5 + idx];
45128e30874SSatish Balay     for (m = 0; m < nz; m++) {
45228e30874SSatish Balay       idx = 6 * vi[m];
4539371c9d4SSatish Balay       x1  = t[idx];
4549371c9d4SSatish Balay       x2  = t[1 + idx];
4559371c9d4SSatish Balay       x3  = t[2 + idx];
4569371c9d4SSatish Balay       x4  = t[3 + idx];
4579371c9d4SSatish Balay       x5  = t[4 + idx];
4589371c9d4SSatish Balay       x6  = t[5 + idx];
45928e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
46028e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
46128e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
46228e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
46328e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
46428e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
46528e30874SSatish Balay       v += 36;
46628e30874SSatish Balay     }
46728e30874SSatish Balay     idx        = 6 * i;
4689371c9d4SSatish Balay     t[idx]     = s1;
4699371c9d4SSatish Balay     t[1 + idx] = s2;
4709371c9d4SSatish Balay     t[2 + idx] = s3;
4719371c9d4SSatish Balay     t[3 + idx] = s4;
4729371c9d4SSatish Balay     t[4 + idx] = s5;
4739371c9d4SSatish Balay     t[5 + idx] = s6;
47428e30874SSatish Balay   }
47528e30874SSatish Balay   /* backward solve the upper triangular */
47628e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
47728e30874SSatish Balay     v   = aa + 36 * (adiag[i + 1] + 1);
47828e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
47928e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
48028e30874SSatish Balay     idt = 6 * i;
4819371c9d4SSatish Balay     s1  = t[idt];
4829371c9d4SSatish Balay     s2  = t[1 + idt];
4839371c9d4SSatish Balay     s3  = t[2 + idt];
4849371c9d4SSatish Balay     s4  = t[3 + idt];
4859371c9d4SSatish Balay     s5  = t[4 + idt];
4869371c9d4SSatish Balay     s6  = t[5 + idt];
48728e30874SSatish Balay     for (m = 0; m < nz; m++) {
48828e30874SSatish Balay       idx = 6 * vi[m];
4899371c9d4SSatish Balay       x1  = t[idx];
4909371c9d4SSatish Balay       x2  = t[1 + idx];
4919371c9d4SSatish Balay       x3  = t[2 + idx];
4929371c9d4SSatish Balay       x4  = t[3 + idx];
4939371c9d4SSatish Balay       x5  = t[4 + idx];
4949371c9d4SSatish Balay       x6  = t[5 + idx];
49528e30874SSatish Balay       s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6;
49628e30874SSatish Balay       s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6;
49728e30874SSatish Balay       s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6;
49828e30874SSatish Balay       s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6;
49928e30874SSatish Balay       s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6;
50028e30874SSatish Balay       s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6;
50128e30874SSatish Balay       v += 36;
50228e30874SSatish Balay     }
50328e30874SSatish Balay     idc    = 6 * c[i];
5049371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6;
5059371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6;
5069371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6;
5079371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6;
5089371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6;
5099371c9d4SSatish Balay     x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6;
51028e30874SSatish Balay   }
51128e30874SSatish Balay 
5129566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
5139566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
5149566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
5159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
5169566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n));
517*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
51828e30874SSatish Balay }
51928e30874SSatish Balay 
520d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx)
521d71ae5a4SJacob Faibussowitsch {
52228e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
52328e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
52428e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout, *diag = a->diag;
52528e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
52628e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
52728e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
52828e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
52928e30874SSatish Balay   const PetscScalar *b;
53028e30874SSatish Balay 
53128e30874SSatish Balay   PetscFunctionBegin;
5329566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
5339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
53428e30874SSatish Balay   t = a->solve_work;
53528e30874SSatish Balay 
5369371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
5379371c9d4SSatish Balay   r = rout;
5389371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
5399371c9d4SSatish Balay   c = cout + (n - 1);
54028e30874SSatish Balay 
54128e30874SSatish Balay   /* forward solve the lower triangular */
54228e30874SSatish Balay   idx  = 5 * (*r++);
5439371c9d4SSatish Balay   t[0] = b[idx];
5449371c9d4SSatish Balay   t[1] = b[1 + idx];
5459371c9d4SSatish Balay   t[2] = b[2 + idx];
5469371c9d4SSatish Balay   t[3] = b[3 + idx];
5479371c9d4SSatish Balay   t[4] = b[4 + idx];
54828e30874SSatish Balay   for (i = 1; i < n; i++) {
54928e30874SSatish Balay     v   = aa + 25 * ai[i];
55028e30874SSatish Balay     vi  = aj + ai[i];
55128e30874SSatish Balay     nz  = diag[i] - ai[i];
55228e30874SSatish Balay     idx = 5 * (*r++);
5539371c9d4SSatish Balay     s1  = b[idx];
5549371c9d4SSatish Balay     s2  = b[1 + idx];
5559371c9d4SSatish Balay     s3  = b[2 + idx];
5569371c9d4SSatish Balay     s4  = b[3 + idx];
55728e30874SSatish Balay     s5  = b[4 + idx];
55828e30874SSatish Balay     while (nz--) {
55928e30874SSatish Balay       idx = 5 * (*vi++);
5609371c9d4SSatish Balay       x1  = t[idx];
5619371c9d4SSatish Balay       x2  = t[1 + idx];
5629371c9d4SSatish Balay       x3  = t[2 + idx];
5639371c9d4SSatish Balay       x4  = t[3 + idx];
5649371c9d4SSatish Balay       x5  = t[4 + idx];
56528e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
56628e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
56728e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
56828e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
56928e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
57028e30874SSatish Balay       v += 25;
57128e30874SSatish Balay     }
57228e30874SSatish Balay     idx        = 5 * i;
5739371c9d4SSatish Balay     t[idx]     = s1;
5749371c9d4SSatish Balay     t[1 + idx] = s2;
5759371c9d4SSatish Balay     t[2 + idx] = s3;
5769371c9d4SSatish Balay     t[3 + idx] = s4;
5779371c9d4SSatish Balay     t[4 + idx] = s5;
57828e30874SSatish Balay   }
57928e30874SSatish Balay   /* backward solve the upper triangular */
58028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
58128e30874SSatish Balay     v   = aa + 25 * diag[i] + 25;
58228e30874SSatish Balay     vi  = aj + diag[i] + 1;
58328e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
58428e30874SSatish Balay     idt = 5 * i;
5859371c9d4SSatish Balay     s1  = t[idt];
5869371c9d4SSatish Balay     s2  = t[1 + idt];
5879371c9d4SSatish Balay     s3  = t[2 + idt];
5889371c9d4SSatish Balay     s4  = t[3 + idt];
5899371c9d4SSatish Balay     s5  = t[4 + idt];
59028e30874SSatish Balay     while (nz--) {
59128e30874SSatish Balay       idx = 5 * (*vi++);
5929371c9d4SSatish Balay       x1  = t[idx];
5939371c9d4SSatish Balay       x2  = t[1 + idx];
5949371c9d4SSatish Balay       x3  = t[2 + idx];
5959371c9d4SSatish Balay       x4  = t[3 + idx];
5969371c9d4SSatish Balay       x5  = t[4 + idx];
59728e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
59828e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
59928e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
60028e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
60128e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
60228e30874SSatish Balay       v += 25;
60328e30874SSatish Balay     }
60428e30874SSatish Balay     idc    = 5 * (*c--);
60528e30874SSatish Balay     v      = aa + 25 * diag[i];
6069371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
6079371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
6089371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
6099371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
6109371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
61128e30874SSatish Balay   }
61228e30874SSatish Balay 
6139566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
6149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
6159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
6169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
6179566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
618*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
61928e30874SSatish Balay }
62028e30874SSatish Balay 
621d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx)
622d71ae5a4SJacob Faibussowitsch {
62328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
62428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
62528e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
62628e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
62728e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
62828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
62928e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t;
63028e30874SSatish Balay   const PetscScalar *b;
63128e30874SSatish Balay 
63228e30874SSatish Balay   PetscFunctionBegin;
6339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
6349566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
63528e30874SSatish Balay   t = a->solve_work;
63628e30874SSatish Balay 
6379371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
6389371c9d4SSatish Balay   r = rout;
6399371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
6409371c9d4SSatish Balay   c = cout;
64128e30874SSatish Balay 
64228e30874SSatish Balay   /* forward solve the lower triangular */
64328e30874SSatish Balay   idx  = 5 * r[0];
6449371c9d4SSatish Balay   t[0] = b[idx];
6459371c9d4SSatish Balay   t[1] = b[1 + idx];
6469371c9d4SSatish Balay   t[2] = b[2 + idx];
6479371c9d4SSatish Balay   t[3] = b[3 + idx];
6489371c9d4SSatish Balay   t[4] = b[4 + idx];
64928e30874SSatish Balay   for (i = 1; i < n; i++) {
65028e30874SSatish Balay     v   = aa + 25 * ai[i];
65128e30874SSatish Balay     vi  = aj + ai[i];
65228e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
65328e30874SSatish Balay     idx = 5 * r[i];
6549371c9d4SSatish Balay     s1  = b[idx];
6559371c9d4SSatish Balay     s2  = b[1 + idx];
6569371c9d4SSatish Balay     s3  = b[2 + idx];
6579371c9d4SSatish Balay     s4  = b[3 + idx];
65828e30874SSatish Balay     s5  = b[4 + idx];
65928e30874SSatish Balay     for (m = 0; m < nz; m++) {
66028e30874SSatish Balay       idx = 5 * vi[m];
6619371c9d4SSatish Balay       x1  = t[idx];
6629371c9d4SSatish Balay       x2  = t[1 + idx];
6639371c9d4SSatish Balay       x3  = t[2 + idx];
6649371c9d4SSatish Balay       x4  = t[3 + idx];
6659371c9d4SSatish Balay       x5  = t[4 + idx];
66628e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
66728e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
66828e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
66928e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
67028e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
67128e30874SSatish Balay       v += 25;
67228e30874SSatish Balay     }
67328e30874SSatish Balay     idx        = 5 * i;
6749371c9d4SSatish Balay     t[idx]     = s1;
6759371c9d4SSatish Balay     t[1 + idx] = s2;
6769371c9d4SSatish Balay     t[2 + idx] = s3;
6779371c9d4SSatish Balay     t[3 + idx] = s4;
6789371c9d4SSatish Balay     t[4 + idx] = s5;
67928e30874SSatish Balay   }
68028e30874SSatish Balay   /* backward solve the upper triangular */
68128e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
68228e30874SSatish Balay     v   = aa + 25 * (adiag[i + 1] + 1);
68328e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
68428e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
68528e30874SSatish Balay     idt = 5 * i;
6869371c9d4SSatish Balay     s1  = t[idt];
6879371c9d4SSatish Balay     s2  = t[1 + idt];
6889371c9d4SSatish Balay     s3  = t[2 + idt];
6899371c9d4SSatish Balay     s4  = t[3 + idt];
6909371c9d4SSatish Balay     s5  = t[4 + idt];
69128e30874SSatish Balay     for (m = 0; m < nz; m++) {
69228e30874SSatish Balay       idx = 5 * vi[m];
6939371c9d4SSatish Balay       x1  = t[idx];
6949371c9d4SSatish Balay       x2  = t[1 + idx];
6959371c9d4SSatish Balay       x3  = t[2 + idx];
6969371c9d4SSatish Balay       x4  = t[3 + idx];
6979371c9d4SSatish Balay       x5  = t[4 + idx];
69828e30874SSatish Balay       s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5;
69928e30874SSatish Balay       s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5;
70028e30874SSatish Balay       s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5;
70128e30874SSatish Balay       s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5;
70228e30874SSatish Balay       s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5;
70328e30874SSatish Balay       v += 25;
70428e30874SSatish Balay     }
70528e30874SSatish Balay     idc    = 5 * c[i];
7069371c9d4SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5;
7079371c9d4SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5;
7089371c9d4SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5;
7099371c9d4SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5;
7109371c9d4SSatish Balay     x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5;
71128e30874SSatish Balay   }
71228e30874SSatish Balay 
7139566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
7149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
7159566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
7169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
7179566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n));
718*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
71928e30874SSatish Balay }
72028e30874SSatish Balay 
721d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx)
722d71ae5a4SJacob Faibussowitsch {
72328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
72428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
72528e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
72628e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
72728e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
72828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
72928e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
73028e30874SSatish Balay   const PetscScalar *b;
73128e30874SSatish Balay 
73228e30874SSatish Balay   PetscFunctionBegin;
7339566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
7349566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
73528e30874SSatish Balay   t = a->solve_work;
73628e30874SSatish Balay 
7379371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
7389371c9d4SSatish Balay   r = rout;
7399371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
7409371c9d4SSatish Balay   c = cout + (n - 1);
74128e30874SSatish Balay 
74228e30874SSatish Balay   /* forward solve the lower triangular */
74328e30874SSatish Balay   idx  = 4 * (*r++);
7449371c9d4SSatish Balay   t[0] = b[idx];
7459371c9d4SSatish Balay   t[1] = b[1 + idx];
7469371c9d4SSatish Balay   t[2] = b[2 + idx];
7479371c9d4SSatish Balay   t[3] = b[3 + idx];
74828e30874SSatish Balay   for (i = 1; i < n; i++) {
74928e30874SSatish Balay     v   = aa + 16 * ai[i];
75028e30874SSatish Balay     vi  = aj + ai[i];
75128e30874SSatish Balay     nz  = diag[i] - ai[i];
75228e30874SSatish Balay     idx = 4 * (*r++);
7539371c9d4SSatish Balay     s1  = b[idx];
7549371c9d4SSatish Balay     s2  = b[1 + idx];
7559371c9d4SSatish Balay     s3  = b[2 + idx];
7569371c9d4SSatish Balay     s4  = b[3 + idx];
75728e30874SSatish Balay     while (nz--) {
75828e30874SSatish Balay       idx = 4 * (*vi++);
7599371c9d4SSatish Balay       x1  = t[idx];
7609371c9d4SSatish Balay       x2  = t[1 + idx];
7619371c9d4SSatish Balay       x3  = t[2 + idx];
7629371c9d4SSatish Balay       x4  = t[3 + idx];
76328e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
76428e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
76528e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
76628e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
76728e30874SSatish Balay       v += 16;
76828e30874SSatish Balay     }
76928e30874SSatish Balay     idx        = 4 * i;
7709371c9d4SSatish Balay     t[idx]     = s1;
7719371c9d4SSatish Balay     t[1 + idx] = s2;
7729371c9d4SSatish Balay     t[2 + idx] = s3;
7739371c9d4SSatish Balay     t[3 + idx] = s4;
77428e30874SSatish Balay   }
77528e30874SSatish Balay   /* backward solve the upper triangular */
77628e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
77728e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
77828e30874SSatish Balay     vi  = aj + diag[i] + 1;
77928e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
78028e30874SSatish Balay     idt = 4 * i;
7819371c9d4SSatish Balay     s1  = t[idt];
7829371c9d4SSatish Balay     s2  = t[1 + idt];
7839371c9d4SSatish Balay     s3  = t[2 + idt];
7849371c9d4SSatish Balay     s4  = t[3 + idt];
78528e30874SSatish Balay     while (nz--) {
78628e30874SSatish Balay       idx = 4 * (*vi++);
7879371c9d4SSatish Balay       x1  = t[idx];
7889371c9d4SSatish Balay       x2  = t[1 + idx];
7899371c9d4SSatish Balay       x3  = t[2 + idx];
7909371c9d4SSatish Balay       x4  = t[3 + idx];
79128e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
79228e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
79328e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
79428e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
79528e30874SSatish Balay       v += 16;
79628e30874SSatish Balay     }
79728e30874SSatish Balay     idc    = 4 * (*c--);
79828e30874SSatish Balay     v      = aa + 16 * diag[i];
79928e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
80028e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
80128e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
80228e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
80328e30874SSatish Balay   }
80428e30874SSatish Balay 
8059566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
8069566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
8079566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
8089566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
8099566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
810*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
81128e30874SSatish Balay }
81228e30874SSatish Balay 
813d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx)
814d71ae5a4SJacob Faibussowitsch {
81528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
81628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
81728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
81828e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
81928e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
82028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
82128e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, s4, x1, x2, x3, x4, *t;
82228e30874SSatish Balay   const PetscScalar *b;
82328e30874SSatish Balay 
82428e30874SSatish Balay   PetscFunctionBegin;
8259566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
8269566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
82728e30874SSatish Balay   t = a->solve_work;
82828e30874SSatish Balay 
8299371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
8309371c9d4SSatish Balay   r = rout;
8319371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
8329371c9d4SSatish Balay   c = cout;
83328e30874SSatish Balay 
83428e30874SSatish Balay   /* forward solve the lower triangular */
83528e30874SSatish Balay   idx  = 4 * r[0];
8369371c9d4SSatish Balay   t[0] = b[idx];
8379371c9d4SSatish Balay   t[1] = b[1 + idx];
8389371c9d4SSatish Balay   t[2] = b[2 + idx];
8399371c9d4SSatish Balay   t[3] = b[3 + idx];
84028e30874SSatish Balay   for (i = 1; i < n; i++) {
84128e30874SSatish Balay     v   = aa + 16 * ai[i];
84228e30874SSatish Balay     vi  = aj + ai[i];
84328e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
84428e30874SSatish Balay     idx = 4 * r[i];
8459371c9d4SSatish Balay     s1  = b[idx];
8469371c9d4SSatish Balay     s2  = b[1 + idx];
8479371c9d4SSatish Balay     s3  = b[2 + idx];
8489371c9d4SSatish Balay     s4  = b[3 + idx];
84928e30874SSatish Balay     for (m = 0; m < nz; m++) {
85028e30874SSatish Balay       idx = 4 * vi[m];
8519371c9d4SSatish Balay       x1  = t[idx];
8529371c9d4SSatish Balay       x2  = t[1 + idx];
8539371c9d4SSatish Balay       x3  = t[2 + idx];
8549371c9d4SSatish Balay       x4  = t[3 + idx];
85528e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
85628e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
85728e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
85828e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
85928e30874SSatish Balay       v += 16;
86028e30874SSatish Balay     }
86128e30874SSatish Balay     idx        = 4 * i;
8629371c9d4SSatish Balay     t[idx]     = s1;
8639371c9d4SSatish Balay     t[1 + idx] = s2;
8649371c9d4SSatish Balay     t[2 + idx] = s3;
8659371c9d4SSatish Balay     t[3 + idx] = s4;
86628e30874SSatish Balay   }
86728e30874SSatish Balay   /* backward solve the upper triangular */
86828e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
86928e30874SSatish Balay     v   = aa + 16 * (adiag[i + 1] + 1);
87028e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
87128e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
87228e30874SSatish Balay     idt = 4 * i;
8739371c9d4SSatish Balay     s1  = t[idt];
8749371c9d4SSatish Balay     s2  = t[1 + idt];
8759371c9d4SSatish Balay     s3  = t[2 + idt];
8769371c9d4SSatish Balay     s4  = t[3 + idt];
87728e30874SSatish Balay     for (m = 0; m < nz; m++) {
87828e30874SSatish Balay       idx = 4 * vi[m];
8799371c9d4SSatish Balay       x1  = t[idx];
8809371c9d4SSatish Balay       x2  = t[1 + idx];
8819371c9d4SSatish Balay       x3  = t[2 + idx];
8829371c9d4SSatish Balay       x4  = t[3 + idx];
88328e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
88428e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
88528e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
88628e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
88728e30874SSatish Balay       v += 16;
88828e30874SSatish Balay     }
88928e30874SSatish Balay     idc    = 4 * c[i];
89028e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
89128e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
89228e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
89328e30874SSatish Balay     x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
89428e30874SSatish Balay   }
89528e30874SSatish Balay 
8969566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
8979566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
8989566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
8999566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
9009566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
901*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
90228e30874SSatish Balay }
90328e30874SSatish Balay 
904d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx)
905d71ae5a4SJacob Faibussowitsch {
90628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
90728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
90828e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
90928e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
91028e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
91128e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
91228e30874SSatish Balay   MatScalar          s1, s2, s3, s4, x1, x2, x3, x4, *t;
91328e30874SSatish Balay   PetscScalar       *x;
91428e30874SSatish Balay   const PetscScalar *b;
91528e30874SSatish Balay 
91628e30874SSatish Balay   PetscFunctionBegin;
9179566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
9189566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
91928e30874SSatish Balay   t = (MatScalar *)a->solve_work;
92028e30874SSatish Balay 
9219371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
9229371c9d4SSatish Balay   r = rout;
9239371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
9249371c9d4SSatish Balay   c = cout + (n - 1);
92528e30874SSatish Balay 
92628e30874SSatish Balay   /* forward solve the lower triangular */
92728e30874SSatish Balay   idx  = 4 * (*r++);
92828e30874SSatish Balay   t[0] = (MatScalar)b[idx];
92928e30874SSatish Balay   t[1] = (MatScalar)b[1 + idx];
93028e30874SSatish Balay   t[2] = (MatScalar)b[2 + idx];
93128e30874SSatish Balay   t[3] = (MatScalar)b[3 + idx];
93228e30874SSatish Balay   for (i = 1; i < n; i++) {
93328e30874SSatish Balay     v   = aa + 16 * ai[i];
93428e30874SSatish Balay     vi  = aj + ai[i];
93528e30874SSatish Balay     nz  = diag[i] - ai[i];
93628e30874SSatish Balay     idx = 4 * (*r++);
93728e30874SSatish Balay     s1  = (MatScalar)b[idx];
93828e30874SSatish Balay     s2  = (MatScalar)b[1 + idx];
93928e30874SSatish Balay     s3  = (MatScalar)b[2 + idx];
94028e30874SSatish Balay     s4  = (MatScalar)b[3 + idx];
94128e30874SSatish Balay     while (nz--) {
94228e30874SSatish Balay       idx = 4 * (*vi++);
94328e30874SSatish Balay       x1  = t[idx];
94428e30874SSatish Balay       x2  = t[1 + idx];
94528e30874SSatish Balay       x3  = t[2 + idx];
94628e30874SSatish Balay       x4  = t[3 + idx];
94728e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
94828e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
94928e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
95028e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
95128e30874SSatish Balay       v += 16;
95228e30874SSatish Balay     }
95328e30874SSatish Balay     idx        = 4 * i;
95428e30874SSatish Balay     t[idx]     = s1;
95528e30874SSatish Balay     t[1 + idx] = s2;
95628e30874SSatish Balay     t[2 + idx] = s3;
95728e30874SSatish Balay     t[3 + idx] = s4;
95828e30874SSatish Balay   }
95928e30874SSatish Balay   /* backward solve the upper triangular */
96028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
96128e30874SSatish Balay     v   = aa + 16 * diag[i] + 16;
96228e30874SSatish Balay     vi  = aj + diag[i] + 1;
96328e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
96428e30874SSatish Balay     idt = 4 * i;
96528e30874SSatish Balay     s1  = t[idt];
96628e30874SSatish Balay     s2  = t[1 + idt];
96728e30874SSatish Balay     s3  = t[2 + idt];
96828e30874SSatish Balay     s4  = t[3 + idt];
96928e30874SSatish Balay     while (nz--) {
97028e30874SSatish Balay       idx = 4 * (*vi++);
97128e30874SSatish Balay       x1  = t[idx];
97228e30874SSatish Balay       x2  = t[1 + idx];
97328e30874SSatish Balay       x3  = t[2 + idx];
97428e30874SSatish Balay       x4  = t[3 + idx];
97528e30874SSatish Balay       s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4;
97628e30874SSatish Balay       s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4;
97728e30874SSatish Balay       s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4;
97828e30874SSatish Balay       s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4;
97928e30874SSatish Balay       v += 16;
98028e30874SSatish Balay     }
98128e30874SSatish Balay     idc        = 4 * (*c--);
98228e30874SSatish Balay     v          = aa + 16 * diag[i];
98328e30874SSatish Balay     t[idt]     = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4;
98428e30874SSatish Balay     t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4;
98528e30874SSatish Balay     t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4;
98628e30874SSatish Balay     t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4;
98728e30874SSatish Balay     x[idc]     = (PetscScalar)t[idt];
98828e30874SSatish Balay     x[1 + idc] = (PetscScalar)t[1 + idt];
98928e30874SSatish Balay     x[2 + idc] = (PetscScalar)t[2 + idt];
99028e30874SSatish Balay     x[3 + idc] = (PetscScalar)t[3 + idt];
99128e30874SSatish Balay   }
99228e30874SSatish Balay 
9939566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
9949566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
9959566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
9969566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
9979566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
998*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
99928e30874SSatish Balay }
100028e30874SSatish Balay 
100128e30874SSatish Balay #if defined(PETSC_HAVE_SSE)
100228e30874SSatish Balay 
100328e30874SSatish Balay   #include PETSC_HAVE_SSE
100428e30874SSatish Balay 
1005d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx)
1006d71ae5a4SJacob Faibussowitsch {
100728e30874SSatish Balay   /*
100828e30874SSatish Balay      Note: This code uses demotion of double
100928e30874SSatish Balay      to float when performing the mixed-mode computation.
101028e30874SSatish Balay      This may not be numerically reasonable for all applications.
101128e30874SSatish Balay   */
101228e30874SSatish Balay   Mat_SeqBAIJ    *a     = (Mat_SeqBAIJ *)A->data;
101328e30874SSatish Balay   IS              iscol = a->col, isrow = a->row;
101428e30874SSatish Balay   PetscInt        i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16;
101528e30874SSatish Balay   const PetscInt *r, *c, *diag = a->diag, *rout, *cout;
101628e30874SSatish Balay   MatScalar      *aa = a->a, *v;
101728e30874SSatish Balay   PetscScalar    *x, *b, *t;
101828e30874SSatish Balay 
101928e30874SSatish Balay   /* Make space in temp stack for 16 Byte Aligned arrays */
102028e30874SSatish Balay   float         ssealignedspace[11], *tmps, *tmpx;
102128e30874SSatish Balay   unsigned long offset;
102228e30874SSatish Balay 
102328e30874SSatish Balay   PetscFunctionBegin;
102428e30874SSatish Balay   SSE_SCOPE_BEGIN;
102528e30874SSatish Balay 
102628e30874SSatish Balay   offset = (unsigned long)ssealignedspace % 16;
102728e30874SSatish Balay   if (offset) offset = (16 - offset) / 4;
102828e30874SSatish Balay   tmps = &ssealignedspace[offset];
102928e30874SSatish Balay   tmpx = &ssealignedspace[offset + 4];
103028e30874SSatish Balay   PREFETCH_NTA(aa + 16 * ai[1]);
103128e30874SSatish Balay 
10329566063dSJacob Faibussowitsch   PetscCall(VecGetArray(bb, &b));
10339566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
103428e30874SSatish Balay   t = a->solve_work;
103528e30874SSatish Balay 
10369371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
10379371c9d4SSatish Balay   r = rout;
10389371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
10399371c9d4SSatish Balay   c = cout + (n - 1);
104028e30874SSatish Balay 
104128e30874SSatish Balay   /* forward solve the lower triangular */
104228e30874SSatish Balay   idx  = 4 * (*r++);
10439371c9d4SSatish Balay   t[0] = b[idx];
10449371c9d4SSatish Balay   t[1] = b[1 + idx];
10459371c9d4SSatish Balay   t[2] = b[2 + idx];
10469371c9d4SSatish Balay   t[3] = b[3 + idx];
104728e30874SSatish Balay   v    = aa + 16 * ai[1];
104828e30874SSatish Balay 
104928e30874SSatish Balay   for (i = 1; i < n;) {
105028e30874SSatish Balay     PREFETCH_NTA(&v[8]);
105128e30874SSatish Balay     vi  = aj + ai[i];
105228e30874SSatish Balay     nz  = diag[i] - ai[i];
105328e30874SSatish Balay     idx = 4 * (*r++);
105428e30874SSatish Balay 
105528e30874SSatish Balay     /* Demote sum from double to float */
105628e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]);
105728e30874SSatish Balay     LOAD_PS(tmps, XMM7);
105828e30874SSatish Balay 
105928e30874SSatish Balay     while (nz--) {
106028e30874SSatish Balay       PREFETCH_NTA(&v[16]);
106128e30874SSatish Balay       idx = 4 * (*vi++);
106228e30874SSatish Balay 
106328e30874SSatish Balay       /* Demote solution (so far) from double to float */
106428e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]);
106528e30874SSatish Balay 
106628e30874SSatish Balay       /* 4x4 Matrix-Vector product with negative accumulation: */
106728e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
106828e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
106928e30874SSatish Balay 
107028e30874SSatish Balay       /* First Column */
107128e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
107228e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
107328e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
107428e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
107528e30874SSatish Balay 
107628e30874SSatish Balay       /* Second Column */
107728e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
107828e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
107928e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
108028e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
108128e30874SSatish Balay 
108228e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
108328e30874SSatish Balay 
108428e30874SSatish Balay       /* Third Column */
108528e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
108628e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
108728e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
108828e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
108928e30874SSatish Balay 
109028e30874SSatish Balay       /* Fourth Column */
109128e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
109228e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
109328e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
109428e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
109528e30874SSatish Balay       SSE_INLINE_END_2
109628e30874SSatish Balay 
109728e30874SSatish Balay       v += 16;
109828e30874SSatish Balay     }
109928e30874SSatish Balay     idx = 4 * i;
110028e30874SSatish Balay     v   = aa + 16 * ai[++i];
110128e30874SSatish Balay     PREFETCH_NTA(v);
110228e30874SSatish Balay     STORE_PS(tmps, XMM7);
110328e30874SSatish Balay 
110428e30874SSatish Balay     /* Promote result from float to double */
110528e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps);
110628e30874SSatish Balay   }
110728e30874SSatish Balay   /* backward solve the upper triangular */
110828e30874SSatish Balay   idt  = 4 * (n - 1);
110928e30874SSatish Balay   ai16 = 16 * diag[n - 1];
111028e30874SSatish Balay   v    = aa + ai16 + 16;
111128e30874SSatish Balay   for (i = n - 1; i >= 0;) {
111228e30874SSatish Balay     PREFETCH_NTA(&v[8]);
111328e30874SSatish Balay     vi = aj + diag[i] + 1;
111428e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
111528e30874SSatish Balay 
111628e30874SSatish Balay     /* Demote accumulator from double to float */
111728e30874SSatish Balay     CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]);
111828e30874SSatish Balay     LOAD_PS(tmps, XMM7);
111928e30874SSatish Balay 
112028e30874SSatish Balay     while (nz--) {
112128e30874SSatish Balay       PREFETCH_NTA(&v[16]);
112228e30874SSatish Balay       idx = 4 * (*vi++);
112328e30874SSatish Balay 
112428e30874SSatish Balay       /* Demote solution (so far) from double to float */
112528e30874SSatish Balay       CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]);
112628e30874SSatish Balay 
112728e30874SSatish Balay       /* 4x4 Matrix-Vector Product with negative accumulation: */
112828e30874SSatish Balay       SSE_INLINE_BEGIN_2(tmpx, v)
112928e30874SSatish Balay       SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6)
113028e30874SSatish Balay 
113128e30874SSatish Balay       /* First Column */
113228e30874SSatish Balay       SSE_COPY_PS(XMM0, XMM6)
113328e30874SSatish Balay       SSE_SHUFFLE(XMM0, XMM0, 0x00)
113428e30874SSatish Balay       SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0)
113528e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM0)
113628e30874SSatish Balay 
113728e30874SSatish Balay       /* Second Column */
113828e30874SSatish Balay       SSE_COPY_PS(XMM1, XMM6)
113928e30874SSatish Balay       SSE_SHUFFLE(XMM1, XMM1, 0x55)
114028e30874SSatish Balay       SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4)
114128e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM1)
114228e30874SSatish Balay 
114328e30874SSatish Balay       SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24)
114428e30874SSatish Balay 
114528e30874SSatish Balay       /* Third Column */
114628e30874SSatish Balay       SSE_COPY_PS(XMM2, XMM6)
114728e30874SSatish Balay       SSE_SHUFFLE(XMM2, XMM2, 0xAA)
114828e30874SSatish Balay       SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8)
114928e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM2)
115028e30874SSatish Balay 
115128e30874SSatish Balay       /* Fourth Column */
115228e30874SSatish Balay       SSE_COPY_PS(XMM3, XMM6)
115328e30874SSatish Balay       SSE_SHUFFLE(XMM3, XMM3, 0xFF)
115428e30874SSatish Balay       SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12)
115528e30874SSatish Balay       SSE_SUB_PS(XMM7, XMM3)
115628e30874SSatish Balay       SSE_INLINE_END_2
115728e30874SSatish Balay       v += 16;
115828e30874SSatish Balay     }
115928e30874SSatish Balay     v    = aa + ai16;
116028e30874SSatish Balay     ai16 = 16 * diag[--i];
116128e30874SSatish Balay     PREFETCH_NTA(aa + ai16 + 16);
116228e30874SSatish Balay     /*
116328e30874SSatish Balay        Scale the result by the diagonal 4x4 block,
116428e30874SSatish Balay        which was inverted as part of the factorization
116528e30874SSatish Balay     */
116628e30874SSatish Balay     SSE_INLINE_BEGIN_3(v, tmps, aa + ai16)
116728e30874SSatish Balay     /* First Column */
116828e30874SSatish Balay     SSE_COPY_PS(XMM0, XMM7)
116928e30874SSatish Balay     SSE_SHUFFLE(XMM0, XMM0, 0x00)
117028e30874SSatish Balay     SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0)
117128e30874SSatish Balay 
117228e30874SSatish Balay     /* Second Column */
117328e30874SSatish Balay     SSE_COPY_PS(XMM1, XMM7)
117428e30874SSatish Balay     SSE_SHUFFLE(XMM1, XMM1, 0x55)
117528e30874SSatish Balay     SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4)
117628e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM1)
117728e30874SSatish Balay 
117828e30874SSatish Balay     SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24)
117928e30874SSatish Balay 
118028e30874SSatish Balay     /* Third Column */
118128e30874SSatish Balay     SSE_COPY_PS(XMM2, XMM7)
118228e30874SSatish Balay     SSE_SHUFFLE(XMM2, XMM2, 0xAA)
118328e30874SSatish Balay     SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8)
118428e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM2)
118528e30874SSatish Balay 
118628e30874SSatish Balay     /* Fourth Column */
118728e30874SSatish Balay     SSE_COPY_PS(XMM3, XMM7)
118828e30874SSatish Balay     SSE_SHUFFLE(XMM3, XMM3, 0xFF)
118928e30874SSatish Balay     SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12)
119028e30874SSatish Balay     SSE_ADD_PS(XMM0, XMM3)
119128e30874SSatish Balay 
119228e30874SSatish Balay     SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0)
119328e30874SSatish Balay     SSE_INLINE_END_3
119428e30874SSatish Balay 
119528e30874SSatish Balay     /* Promote solution from float to double */
119628e30874SSatish Balay     CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps);
119728e30874SSatish Balay 
119828e30874SSatish Balay     /* Apply reordering to t and stream into x.    */
119928e30874SSatish Balay     /* This way, x doesn't pollute the cache.      */
120028e30874SSatish Balay     /* Be careful with size: 2 doubles = 4 floats! */
120128e30874SSatish Balay     idc = 4 * (*c--);
120228e30874SSatish Balay     SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc])
120328e30874SSatish Balay     /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
120428e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0)
120528e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0)
120628e30874SSatish Balay     /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
120728e30874SSatish Balay     SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1)
120828e30874SSatish Balay     SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1)
120928e30874SSatish Balay     SSE_INLINE_END_2
121028e30874SSatish Balay     v = aa + ai16 + 16;
121128e30874SSatish Balay     idt -= 4;
121228e30874SSatish Balay   }
121328e30874SSatish Balay 
12149566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
12159566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
12169566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(bb, &b));
12179566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
12189566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n));
121928e30874SSatish Balay   SSE_SCOPE_END;
1220*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
122128e30874SSatish Balay }
122228e30874SSatish Balay 
122328e30874SSatish Balay #endif
122428e30874SSatish Balay 
1225d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx)
1226d71ae5a4SJacob Faibussowitsch {
122728e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
122828e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
122928e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
123028e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
123128e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
123228e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
123328e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
123428e30874SSatish Balay   const PetscScalar *b;
123528e30874SSatish Balay 
123628e30874SSatish Balay   PetscFunctionBegin;
12379566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
12389566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
123928e30874SSatish Balay   t = a->solve_work;
124028e30874SSatish Balay 
12419371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
12429371c9d4SSatish Balay   r = rout;
12439371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
12449371c9d4SSatish Balay   c = cout + (n - 1);
124528e30874SSatish Balay 
124628e30874SSatish Balay   /* forward solve the lower triangular */
124728e30874SSatish Balay   idx  = 3 * (*r++);
12489371c9d4SSatish Balay   t[0] = b[idx];
12499371c9d4SSatish Balay   t[1] = b[1 + idx];
12509371c9d4SSatish Balay   t[2] = b[2 + idx];
125128e30874SSatish Balay   for (i = 1; i < n; i++) {
125228e30874SSatish Balay     v   = aa + 9 * ai[i];
125328e30874SSatish Balay     vi  = aj + ai[i];
125428e30874SSatish Balay     nz  = diag[i] - ai[i];
125528e30874SSatish Balay     idx = 3 * (*r++);
12569371c9d4SSatish Balay     s1  = b[idx];
12579371c9d4SSatish Balay     s2  = b[1 + idx];
12589371c9d4SSatish Balay     s3  = b[2 + idx];
125928e30874SSatish Balay     while (nz--) {
126028e30874SSatish Balay       idx = 3 * (*vi++);
12619371c9d4SSatish Balay       x1  = t[idx];
12629371c9d4SSatish Balay       x2  = t[1 + idx];
12639371c9d4SSatish Balay       x3  = t[2 + idx];
126428e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
126528e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
126628e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
126728e30874SSatish Balay       v += 9;
126828e30874SSatish Balay     }
126928e30874SSatish Balay     idx        = 3 * i;
12709371c9d4SSatish Balay     t[idx]     = s1;
12719371c9d4SSatish Balay     t[1 + idx] = s2;
12729371c9d4SSatish Balay     t[2 + idx] = s3;
127328e30874SSatish Balay   }
127428e30874SSatish Balay   /* backward solve the upper triangular */
127528e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
127628e30874SSatish Balay     v   = aa + 9 * diag[i] + 9;
127728e30874SSatish Balay     vi  = aj + diag[i] + 1;
127828e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
127928e30874SSatish Balay     idt = 3 * i;
12809371c9d4SSatish Balay     s1  = t[idt];
12819371c9d4SSatish Balay     s2  = t[1 + idt];
12829371c9d4SSatish Balay     s3  = t[2 + idt];
128328e30874SSatish Balay     while (nz--) {
128428e30874SSatish Balay       idx = 3 * (*vi++);
12859371c9d4SSatish Balay       x1  = t[idx];
12869371c9d4SSatish Balay       x2  = t[1 + idx];
12879371c9d4SSatish Balay       x3  = t[2 + idx];
128828e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
128928e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
129028e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
129128e30874SSatish Balay       v += 9;
129228e30874SSatish Balay     }
129328e30874SSatish Balay     idc    = 3 * (*c--);
129428e30874SSatish Balay     v      = aa + 9 * diag[i];
129528e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
129628e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
129728e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
129828e30874SSatish Balay   }
12999566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
13009566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
13019566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
13029566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
13039566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1304*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
130528e30874SSatish Balay }
130628e30874SSatish Balay 
1307d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx)
1308d71ae5a4SJacob Faibussowitsch {
130928e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
131028e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
131128e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
131228e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc, m;
131328e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
131428e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
131528e30874SSatish Balay   PetscScalar       *x, s1, s2, s3, x1, x2, x3, *t;
131628e30874SSatish Balay   const PetscScalar *b;
131728e30874SSatish Balay 
131828e30874SSatish Balay   PetscFunctionBegin;
13199566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
13209566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
132128e30874SSatish Balay   t = a->solve_work;
132228e30874SSatish Balay 
13239371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
13249371c9d4SSatish Balay   r = rout;
13259371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
13269371c9d4SSatish Balay   c = cout;
132728e30874SSatish Balay 
132828e30874SSatish Balay   /* forward solve the lower triangular */
132928e30874SSatish Balay   idx  = 3 * r[0];
13309371c9d4SSatish Balay   t[0] = b[idx];
13319371c9d4SSatish Balay   t[1] = b[1 + idx];
13329371c9d4SSatish Balay   t[2] = b[2 + idx];
133328e30874SSatish Balay   for (i = 1; i < n; i++) {
133428e30874SSatish Balay     v   = aa + 9 * ai[i];
133528e30874SSatish Balay     vi  = aj + ai[i];
133628e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
133728e30874SSatish Balay     idx = 3 * r[i];
13389371c9d4SSatish Balay     s1  = b[idx];
13399371c9d4SSatish Balay     s2  = b[1 + idx];
13409371c9d4SSatish Balay     s3  = b[2 + idx];
134128e30874SSatish Balay     for (m = 0; m < nz; m++) {
134228e30874SSatish Balay       idx = 3 * vi[m];
13439371c9d4SSatish Balay       x1  = t[idx];
13449371c9d4SSatish Balay       x2  = t[1 + idx];
13459371c9d4SSatish Balay       x3  = t[2 + idx];
134628e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
134728e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
134828e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
134928e30874SSatish Balay       v += 9;
135028e30874SSatish Balay     }
135128e30874SSatish Balay     idx        = 3 * i;
13529371c9d4SSatish Balay     t[idx]     = s1;
13539371c9d4SSatish Balay     t[1 + idx] = s2;
13549371c9d4SSatish Balay     t[2 + idx] = s3;
135528e30874SSatish Balay   }
135628e30874SSatish Balay   /* backward solve the upper triangular */
135728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
135828e30874SSatish Balay     v   = aa + 9 * (adiag[i + 1] + 1);
135928e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
136028e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
136128e30874SSatish Balay     idt = 3 * i;
13629371c9d4SSatish Balay     s1  = t[idt];
13639371c9d4SSatish Balay     s2  = t[1 + idt];
13649371c9d4SSatish Balay     s3  = t[2 + idt];
136528e30874SSatish Balay     for (m = 0; m < nz; m++) {
136628e30874SSatish Balay       idx = 3 * vi[m];
13679371c9d4SSatish Balay       x1  = t[idx];
13689371c9d4SSatish Balay       x2  = t[1 + idx];
13699371c9d4SSatish Balay       x3  = t[2 + idx];
137028e30874SSatish Balay       s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3;
137128e30874SSatish Balay       s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3;
137228e30874SSatish Balay       s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3;
137328e30874SSatish Balay       v += 9;
137428e30874SSatish Balay     }
137528e30874SSatish Balay     idc    = 3 * c[i];
137628e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3;
137728e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3;
137828e30874SSatish Balay     x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3;
137928e30874SSatish Balay   }
13809566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
13819566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
13829566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
13839566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
13849566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n));
1385*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
138628e30874SSatish Balay }
138728e30874SSatish Balay 
1388d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx)
1389d71ae5a4SJacob Faibussowitsch {
139028e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
139128e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
139228e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
139328e30874SSatish Balay   PetscInt           i, nz, idx, idt, idc;
139428e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
139528e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
139628e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
139728e30874SSatish Balay   const PetscScalar *b;
139828e30874SSatish Balay 
139928e30874SSatish Balay   PetscFunctionBegin;
14009566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
14019566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
140228e30874SSatish Balay   t = a->solve_work;
140328e30874SSatish Balay 
14049371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
14059371c9d4SSatish Balay   r = rout;
14069371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
14079371c9d4SSatish Balay   c = cout + (n - 1);
140828e30874SSatish Balay 
140928e30874SSatish Balay   /* forward solve the lower triangular */
141028e30874SSatish Balay   idx  = 2 * (*r++);
14119371c9d4SSatish Balay   t[0] = b[idx];
14129371c9d4SSatish Balay   t[1] = b[1 + idx];
141328e30874SSatish Balay   for (i = 1; i < n; i++) {
141428e30874SSatish Balay     v   = aa + 4 * ai[i];
141528e30874SSatish Balay     vi  = aj + ai[i];
141628e30874SSatish Balay     nz  = diag[i] - ai[i];
141728e30874SSatish Balay     idx = 2 * (*r++);
14189371c9d4SSatish Balay     s1  = b[idx];
14199371c9d4SSatish Balay     s2  = b[1 + idx];
142028e30874SSatish Balay     while (nz--) {
142128e30874SSatish Balay       idx = 2 * (*vi++);
14229371c9d4SSatish Balay       x1  = t[idx];
14239371c9d4SSatish Balay       x2  = t[1 + idx];
142428e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
142528e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
142628e30874SSatish Balay       v += 4;
142728e30874SSatish Balay     }
142828e30874SSatish Balay     idx        = 2 * i;
14299371c9d4SSatish Balay     t[idx]     = s1;
14309371c9d4SSatish Balay     t[1 + idx] = s2;
143128e30874SSatish Balay   }
143228e30874SSatish Balay   /* backward solve the upper triangular */
143328e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
143428e30874SSatish Balay     v   = aa + 4 * diag[i] + 4;
143528e30874SSatish Balay     vi  = aj + diag[i] + 1;
143628e30874SSatish Balay     nz  = ai[i + 1] - diag[i] - 1;
143728e30874SSatish Balay     idt = 2 * i;
14389371c9d4SSatish Balay     s1  = t[idt];
14399371c9d4SSatish Balay     s2  = t[1 + idt];
144028e30874SSatish Balay     while (nz--) {
144128e30874SSatish Balay       idx = 2 * (*vi++);
14429371c9d4SSatish Balay       x1  = t[idx];
14439371c9d4SSatish Balay       x2  = t[1 + idx];
144428e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
144528e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
144628e30874SSatish Balay       v += 4;
144728e30874SSatish Balay     }
144828e30874SSatish Balay     idc    = 2 * (*c--);
144928e30874SSatish Balay     v      = aa + 4 * diag[i];
145028e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
145128e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
145228e30874SSatish Balay   }
14539566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
14549566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
14559566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
14569566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
14579566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1458*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
145928e30874SSatish Balay }
146028e30874SSatish Balay 
1461d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx)
1462d71ae5a4SJacob Faibussowitsch {
146328e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
146428e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
146528e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag;
146628e30874SSatish Balay   PetscInt           i, nz, idx, jdx, idt, idc, m;
146728e30874SSatish Balay   const PetscInt    *r, *c, *rout, *cout;
146828e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
146928e30874SSatish Balay   PetscScalar       *x, s1, s2, x1, x2, *t;
147028e30874SSatish Balay   const PetscScalar *b;
147128e30874SSatish Balay 
147228e30874SSatish Balay   PetscFunctionBegin;
14739566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
14749566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
147528e30874SSatish Balay   t = a->solve_work;
147628e30874SSatish Balay 
14779371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
14789371c9d4SSatish Balay   r = rout;
14799371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
14809371c9d4SSatish Balay   c = cout;
148128e30874SSatish Balay 
148228e30874SSatish Balay   /* forward solve the lower triangular */
148328e30874SSatish Balay   idx  = 2 * r[0];
14849371c9d4SSatish Balay   t[0] = b[idx];
14859371c9d4SSatish Balay   t[1] = b[1 + idx];
148628e30874SSatish Balay   for (i = 1; i < n; i++) {
148728e30874SSatish Balay     v   = aa + 4 * ai[i];
148828e30874SSatish Balay     vi  = aj + ai[i];
148928e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
149028e30874SSatish Balay     idx = 2 * r[i];
14919371c9d4SSatish Balay     s1  = b[idx];
14929371c9d4SSatish Balay     s2  = b[1 + idx];
149328e30874SSatish Balay     for (m = 0; m < nz; m++) {
149428e30874SSatish Balay       jdx = 2 * vi[m];
14959371c9d4SSatish Balay       x1  = t[jdx];
14969371c9d4SSatish Balay       x2  = t[1 + jdx];
149728e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
149828e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
149928e30874SSatish Balay       v += 4;
150028e30874SSatish Balay     }
150128e30874SSatish Balay     idx        = 2 * i;
15029371c9d4SSatish Balay     t[idx]     = s1;
15039371c9d4SSatish Balay     t[1 + idx] = s2;
150428e30874SSatish Balay   }
150528e30874SSatish Balay   /* backward solve the upper triangular */
150628e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
150728e30874SSatish Balay     v   = aa + 4 * (adiag[i + 1] + 1);
150828e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
150928e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
151028e30874SSatish Balay     idt = 2 * i;
15119371c9d4SSatish Balay     s1  = t[idt];
15129371c9d4SSatish Balay     s2  = t[1 + idt];
151328e30874SSatish Balay     for (m = 0; m < nz; m++) {
151428e30874SSatish Balay       idx = 2 * vi[m];
15159371c9d4SSatish Balay       x1  = t[idx];
15169371c9d4SSatish Balay       x2  = t[1 + idx];
151728e30874SSatish Balay       s1 -= v[0] * x1 + v[2] * x2;
151828e30874SSatish Balay       s2 -= v[1] * x1 + v[3] * x2;
151928e30874SSatish Balay       v += 4;
152028e30874SSatish Balay     }
152128e30874SSatish Balay     idc    = 2 * c[i];
152228e30874SSatish Balay     x[idc] = t[idt] = v[0] * s1 + v[2] * s2;
152328e30874SSatish Balay     x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2;
152428e30874SSatish Balay   }
15259566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15269566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15279566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15289566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15299566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n));
1530*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
153128e30874SSatish Balay }
153228e30874SSatish Balay 
1533d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx)
1534d71ae5a4SJacob Faibussowitsch {
153528e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
153628e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
153728e30874SSatish Balay   const PetscInt     n = a->mbs, *vi, *ai = a->i, *aj = a->j;
153828e30874SSatish Balay   PetscInt           i, nz;
153928e30874SSatish Balay   const PetscInt    *r, *c, *diag = a->diag, *rout, *cout;
154028e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
154128e30874SSatish Balay   PetscScalar       *x, s1, *t;
154228e30874SSatish Balay   const PetscScalar *b;
154328e30874SSatish Balay 
154428e30874SSatish Balay   PetscFunctionBegin;
1545*3ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
154628e30874SSatish Balay 
15479566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15489566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
154928e30874SSatish Balay   t = a->solve_work;
155028e30874SSatish Balay 
15519371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
15529371c9d4SSatish Balay   r = rout;
15539371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
15549371c9d4SSatish Balay   c = cout + (n - 1);
155528e30874SSatish Balay 
155628e30874SSatish Balay   /* forward solve the lower triangular */
155728e30874SSatish Balay   t[0] = b[*r++];
155828e30874SSatish Balay   for (i = 1; i < n; i++) {
155928e30874SSatish Balay     v  = aa + ai[i];
156028e30874SSatish Balay     vi = aj + ai[i];
156128e30874SSatish Balay     nz = diag[i] - ai[i];
156228e30874SSatish Balay     s1 = b[*r++];
1563ad540459SPierre Jolivet     while (nz--) s1 -= (*v++) * t[*vi++];
156428e30874SSatish Balay     t[i] = s1;
156528e30874SSatish Balay   }
156628e30874SSatish Balay   /* backward solve the upper triangular */
156728e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
156828e30874SSatish Balay     v  = aa + diag[i] + 1;
156928e30874SSatish Balay     vi = aj + diag[i] + 1;
157028e30874SSatish Balay     nz = ai[i + 1] - diag[i] - 1;
157128e30874SSatish Balay     s1 = t[i];
1572ad540459SPierre Jolivet     while (nz--) s1 -= (*v++) * t[*vi++];
157328e30874SSatish Balay     x[*c--] = t[i] = aa[diag[i]] * s1;
157428e30874SSatish Balay   }
157528e30874SSatish Balay 
15769566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
15779566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
15789566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
15799566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
15809566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n));
1581*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
158228e30874SSatish Balay }
158328e30874SSatish Balay 
1584d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx)
1585d71ae5a4SJacob Faibussowitsch {
158628e30874SSatish Balay   Mat_SeqBAIJ       *a     = (Mat_SeqBAIJ *)A->data;
158728e30874SSatish Balay   IS                 iscol = a->col, isrow = a->row;
158828e30874SSatish Balay   PetscInt           i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz;
158928e30874SSatish Balay   const PetscInt    *rout, *cout, *r, *c;
159028e30874SSatish Balay   PetscScalar       *x, *tmp, sum;
159128e30874SSatish Balay   const PetscScalar *b;
159228e30874SSatish Balay   const MatScalar   *aa = a->a, *v;
159328e30874SSatish Balay 
159428e30874SSatish Balay   PetscFunctionBegin;
1595*3ba16761SJacob Faibussowitsch   if (!n) PetscFunctionReturn(PETSC_SUCCESS);
159628e30874SSatish Balay 
15979566063dSJacob Faibussowitsch   PetscCall(VecGetArrayRead(bb, &b));
15989566063dSJacob Faibussowitsch   PetscCall(VecGetArray(xx, &x));
159928e30874SSatish Balay   tmp = a->solve_work;
160028e30874SSatish Balay 
16019371c9d4SSatish Balay   PetscCall(ISGetIndices(isrow, &rout));
16029371c9d4SSatish Balay   r = rout;
16039371c9d4SSatish Balay   PetscCall(ISGetIndices(iscol, &cout));
16049371c9d4SSatish Balay   c = cout;
160528e30874SSatish Balay 
160628e30874SSatish Balay   /* forward solve the lower triangular */
160728e30874SSatish Balay   tmp[0] = b[r[0]];
160828e30874SSatish Balay   v      = aa;
160928e30874SSatish Balay   vi     = aj;
161028e30874SSatish Balay   for (i = 1; i < n; i++) {
161128e30874SSatish Balay     nz  = ai[i + 1] - ai[i];
161228e30874SSatish Balay     sum = b[r[i]];
161328e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
161428e30874SSatish Balay     tmp[i] = sum;
16159371c9d4SSatish Balay     v += nz;
16169371c9d4SSatish Balay     vi += nz;
161728e30874SSatish Balay   }
161828e30874SSatish Balay 
161928e30874SSatish Balay   /* backward solve the upper triangular */
162028e30874SSatish Balay   for (i = n - 1; i >= 0; i--) {
162128e30874SSatish Balay     v   = aa + adiag[i + 1] + 1;
162228e30874SSatish Balay     vi  = aj + adiag[i + 1] + 1;
162328e30874SSatish Balay     nz  = adiag[i] - adiag[i + 1] - 1;
162428e30874SSatish Balay     sum = tmp[i];
162528e30874SSatish Balay     PetscSparseDenseMinusDot(sum, tmp, v, vi, nz);
162628e30874SSatish Balay     x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */
162728e30874SSatish Balay   }
162828e30874SSatish Balay 
16299566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(isrow, &rout));
16309566063dSJacob Faibussowitsch   PetscCall(ISRestoreIndices(iscol, &cout));
16319566063dSJacob Faibussowitsch   PetscCall(VecRestoreArrayRead(bb, &b));
16329566063dSJacob Faibussowitsch   PetscCall(VecRestoreArray(xx, &x));
16339566063dSJacob Faibussowitsch   PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n));
1634*3ba16761SJacob Faibussowitsch   PetscFunctionReturn(PETSC_SUCCESS);
163528e30874SSatish Balay }
1636