128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 328e30874SSatish Balay 49371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 728e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 828e30874SSatish Balay const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 928e30874SSatish Balay PetscInt i, nz; 1028e30874SSatish Balay const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 1128e30874SSatish Balay const MatScalar *aa = a->a, *v; 1228e30874SSatish Balay PetscScalar *x, *s, *t, *ls; 1328e30874SSatish Balay const PetscScalar *b; 1428e30874SSatish Balay 1528e30874SSatish Balay PetscFunctionBegin; 169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 179566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 1828e30874SSatish Balay t = a->solve_work; 1928e30874SSatish Balay 209371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 219371c9d4SSatish Balay r = rout; 229371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 239371c9d4SSatish Balay c = cout + (n - 1); 2428e30874SSatish Balay 2528e30874SSatish Balay /* forward solve the lower triangular */ 269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 2728e30874SSatish Balay for (i = 1; i < n; i++) { 2828e30874SSatish Balay v = aa + bs2 * ai[i]; 2928e30874SSatish Balay vi = aj + ai[i]; 3028e30874SSatish Balay nz = a->diag[i] - ai[i]; 3128e30874SSatish Balay s = t + bs * i; 329566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 3328e30874SSatish Balay while (nz--) { 3428e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 3528e30874SSatish Balay v += bs2; 3628e30874SSatish Balay } 3728e30874SSatish Balay } 3828e30874SSatish Balay /* backward solve the upper triangular */ 3928e30874SSatish Balay ls = a->solve_work + A->cmap->n; 4028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 4128e30874SSatish Balay v = aa + bs2 * (a->diag[i] + 1); 4228e30874SSatish Balay vi = aj + a->diag[i] + 1; 4328e30874SSatish Balay nz = ai[i + 1] - a->diag[i] - 1; 449566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 4528e30874SSatish Balay while (nz--) { 4628e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 4728e30874SSatish Balay v += bs2; 4828e30874SSatish Balay } 4928e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 509566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 5128e30874SSatish Balay } 5228e30874SSatish Balay 539566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 569566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 5828e30874SSatish Balay PetscFunctionReturn(0); 5928e30874SSatish Balay } 6028e30874SSatish Balay 619371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) { 6228e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6328e30874SSatish Balay IS iscol = a->col, isrow = a->row; 6428e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j; 6528e30874SSatish Balay const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 6628e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 6728e30874SSatish Balay const MatScalar *aa = a->a, *v; 6828e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 6928e30874SSatish Balay const PetscScalar *b; 7028e30874SSatish Balay 7128e30874SSatish Balay PetscFunctionBegin; 729566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 739566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 7428e30874SSatish Balay t = a->solve_work; 7528e30874SSatish Balay 769371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 779371c9d4SSatish Balay r = rout; 789371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 799371c9d4SSatish Balay c = cout + (n - 1); 8028e30874SSatish Balay 8128e30874SSatish Balay /* forward solve the lower triangular */ 8228e30874SSatish Balay idx = 7 * (*r++); 839371c9d4SSatish Balay t[0] = b[idx]; 849371c9d4SSatish Balay t[1] = b[1 + idx]; 859371c9d4SSatish Balay t[2] = b[2 + idx]; 869371c9d4SSatish Balay t[3] = b[3 + idx]; 879371c9d4SSatish Balay t[4] = b[4 + idx]; 889371c9d4SSatish Balay t[5] = b[5 + idx]; 899371c9d4SSatish Balay t[6] = b[6 + idx]; 9028e30874SSatish Balay 9128e30874SSatish Balay for (i = 1; i < n; i++) { 9228e30874SSatish Balay v = aa + 49 * ai[i]; 9328e30874SSatish Balay vi = aj + ai[i]; 9428e30874SSatish Balay nz = diag[i] - ai[i]; 9528e30874SSatish Balay idx = 7 * (*r++); 969371c9d4SSatish Balay s1 = b[idx]; 979371c9d4SSatish Balay s2 = b[1 + idx]; 989371c9d4SSatish Balay s3 = b[2 + idx]; 999371c9d4SSatish Balay s4 = b[3 + idx]; 1009371c9d4SSatish Balay s5 = b[4 + idx]; 1019371c9d4SSatish Balay s6 = b[5 + idx]; 1029371c9d4SSatish Balay s7 = b[6 + idx]; 10328e30874SSatish Balay while (nz--) { 10428e30874SSatish Balay idx = 7 * (*vi++); 1059371c9d4SSatish Balay x1 = t[idx]; 1069371c9d4SSatish Balay x2 = t[1 + idx]; 1079371c9d4SSatish Balay x3 = t[2 + idx]; 1089371c9d4SSatish Balay x4 = t[3 + idx]; 1099371c9d4SSatish Balay x5 = t[4 + idx]; 1109371c9d4SSatish Balay x6 = t[5 + idx]; 1119371c9d4SSatish Balay x7 = t[6 + idx]; 11228e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 11328e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 11428e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 11528e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 11628e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 11728e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 11828e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 11928e30874SSatish Balay v += 49; 12028e30874SSatish Balay } 12128e30874SSatish Balay idx = 7 * i; 1229371c9d4SSatish Balay t[idx] = s1; 1239371c9d4SSatish Balay t[1 + idx] = s2; 1249371c9d4SSatish Balay t[2 + idx] = s3; 1259371c9d4SSatish Balay t[3 + idx] = s4; 1269371c9d4SSatish Balay t[4 + idx] = s5; 1279371c9d4SSatish Balay t[5 + idx] = s6; 1289371c9d4SSatish Balay t[6 + idx] = s7; 12928e30874SSatish Balay } 13028e30874SSatish Balay /* backward solve the upper triangular */ 13128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 13228e30874SSatish Balay v = aa + 49 * diag[i] + 49; 13328e30874SSatish Balay vi = aj + diag[i] + 1; 13428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 13528e30874SSatish Balay idt = 7 * i; 1369371c9d4SSatish Balay s1 = t[idt]; 1379371c9d4SSatish Balay s2 = t[1 + idt]; 1389371c9d4SSatish Balay s3 = t[2 + idt]; 1399371c9d4SSatish Balay s4 = t[3 + idt]; 1409371c9d4SSatish Balay s5 = t[4 + idt]; 1419371c9d4SSatish Balay s6 = t[5 + idt]; 1429371c9d4SSatish Balay s7 = t[6 + idt]; 14328e30874SSatish Balay while (nz--) { 14428e30874SSatish Balay idx = 7 * (*vi++); 1459371c9d4SSatish Balay x1 = t[idx]; 1469371c9d4SSatish Balay x2 = t[1 + idx]; 1479371c9d4SSatish Balay x3 = t[2 + idx]; 1489371c9d4SSatish Balay x4 = t[3 + idx]; 1499371c9d4SSatish Balay x5 = t[4 + idx]; 1509371c9d4SSatish Balay x6 = t[5 + idx]; 1519371c9d4SSatish Balay x7 = t[6 + idx]; 15228e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 15328e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 15428e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 15528e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 15628e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 15728e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 15828e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 15928e30874SSatish Balay v += 49; 16028e30874SSatish Balay } 16128e30874SSatish Balay idc = 7 * (*c--); 16228e30874SSatish Balay v = aa + 49 * diag[i]; 1639371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 1649371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 1659371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 1669371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 1679371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 1689371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 1699371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 17028e30874SSatish Balay } 17128e30874SSatish Balay 1729566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 1739566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 1749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 1759566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 1769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 17728e30874SSatish Balay PetscFunctionReturn(0); 17828e30874SSatish Balay } 17928e30874SSatish Balay 1809371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) { 18128e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18228e30874SSatish Balay IS iscol = a->col, isrow = a->row; 18328e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 18428e30874SSatish Balay const PetscInt n = a->mbs, *rout, *cout, *vi; 18528e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 18628e30874SSatish Balay const MatScalar *aa = a->a, *v; 18728e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 18828e30874SSatish Balay const PetscScalar *b; 18928e30874SSatish Balay 19028e30874SSatish Balay PetscFunctionBegin; 1919566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 1929566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 19328e30874SSatish Balay t = a->solve_work; 19428e30874SSatish Balay 1959371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1969371c9d4SSatish Balay r = rout; 1979371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1989371c9d4SSatish Balay c = cout; 19928e30874SSatish Balay 20028e30874SSatish Balay /* forward solve the lower triangular */ 20128e30874SSatish Balay idx = 7 * r[0]; 2029371c9d4SSatish Balay t[0] = b[idx]; 2039371c9d4SSatish Balay t[1] = b[1 + idx]; 2049371c9d4SSatish Balay t[2] = b[2 + idx]; 2059371c9d4SSatish Balay t[3] = b[3 + idx]; 2069371c9d4SSatish Balay t[4] = b[4 + idx]; 2079371c9d4SSatish Balay t[5] = b[5 + idx]; 2089371c9d4SSatish Balay t[6] = b[6 + idx]; 20928e30874SSatish Balay 21028e30874SSatish Balay for (i = 1; i < n; i++) { 21128e30874SSatish Balay v = aa + 49 * ai[i]; 21228e30874SSatish Balay vi = aj + ai[i]; 21328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 21428e30874SSatish Balay idx = 7 * r[i]; 2159371c9d4SSatish Balay s1 = b[idx]; 2169371c9d4SSatish Balay s2 = b[1 + idx]; 2179371c9d4SSatish Balay s3 = b[2 + idx]; 2189371c9d4SSatish Balay s4 = b[3 + idx]; 2199371c9d4SSatish Balay s5 = b[4 + idx]; 2209371c9d4SSatish Balay s6 = b[5 + idx]; 2219371c9d4SSatish Balay s7 = b[6 + idx]; 22228e30874SSatish Balay for (m = 0; m < nz; m++) { 22328e30874SSatish Balay idx = 7 * vi[m]; 2249371c9d4SSatish Balay x1 = t[idx]; 2259371c9d4SSatish Balay x2 = t[1 + idx]; 2269371c9d4SSatish Balay x3 = t[2 + idx]; 2279371c9d4SSatish Balay x4 = t[3 + idx]; 2289371c9d4SSatish Balay x5 = t[4 + idx]; 2299371c9d4SSatish Balay x6 = t[5 + idx]; 2309371c9d4SSatish Balay x7 = t[6 + idx]; 23128e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 23228e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 23328e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 23428e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 23528e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 23628e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 23728e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 23828e30874SSatish Balay v += 49; 23928e30874SSatish Balay } 24028e30874SSatish Balay idx = 7 * i; 2419371c9d4SSatish Balay t[idx] = s1; 2429371c9d4SSatish Balay t[1 + idx] = s2; 2439371c9d4SSatish Balay t[2 + idx] = s3; 2449371c9d4SSatish Balay t[3 + idx] = s4; 2459371c9d4SSatish Balay t[4 + idx] = s5; 2469371c9d4SSatish Balay t[5 + idx] = s6; 2479371c9d4SSatish Balay t[6 + idx] = s7; 24828e30874SSatish Balay } 24928e30874SSatish Balay /* backward solve the upper triangular */ 25028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 25128e30874SSatish Balay v = aa + 49 * (adiag[i + 1] + 1); 25228e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 25328e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 25428e30874SSatish Balay idt = 7 * i; 2559371c9d4SSatish Balay s1 = t[idt]; 2569371c9d4SSatish Balay s2 = t[1 + idt]; 2579371c9d4SSatish Balay s3 = t[2 + idt]; 2589371c9d4SSatish Balay s4 = t[3 + idt]; 2599371c9d4SSatish Balay s5 = t[4 + idt]; 2609371c9d4SSatish Balay s6 = t[5 + idt]; 2619371c9d4SSatish Balay s7 = t[6 + idt]; 26228e30874SSatish Balay for (m = 0; m < nz; m++) { 26328e30874SSatish Balay idx = 7 * vi[m]; 2649371c9d4SSatish Balay x1 = t[idx]; 2659371c9d4SSatish Balay x2 = t[1 + idx]; 2669371c9d4SSatish Balay x3 = t[2 + idx]; 2679371c9d4SSatish Balay x4 = t[3 + idx]; 2689371c9d4SSatish Balay x5 = t[4 + idx]; 2699371c9d4SSatish Balay x6 = t[5 + idx]; 2709371c9d4SSatish Balay x7 = t[6 + idx]; 27128e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 27228e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 27328e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 27428e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 27528e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 27628e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 27728e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 27828e30874SSatish Balay v += 49; 27928e30874SSatish Balay } 28028e30874SSatish Balay idc = 7 * c[i]; 2819371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 2829371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 2839371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 2849371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 2859371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 2869371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 2879371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 28828e30874SSatish Balay } 28928e30874SSatish Balay 2909566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 2919566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 2929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 2939566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 2949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 29528e30874SSatish Balay PetscFunctionReturn(0); 29628e30874SSatish Balay } 29728e30874SSatish Balay 2989371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) { 29928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 30028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 30128e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 30228e30874SSatish Balay const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 30328e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 30428e30874SSatish Balay const MatScalar *aa = a->a, *v; 30528e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 30628e30874SSatish Balay const PetscScalar *b; 30728e30874SSatish Balay 30828e30874SSatish Balay PetscFunctionBegin; 3099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 3109566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 31128e30874SSatish Balay t = a->solve_work; 31228e30874SSatish Balay 3139371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 3149371c9d4SSatish Balay r = rout; 3159371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 3169371c9d4SSatish Balay c = cout + (n - 1); 31728e30874SSatish Balay 31828e30874SSatish Balay /* forward solve the lower triangular */ 31928e30874SSatish Balay idx = 6 * (*r++); 3209371c9d4SSatish Balay t[0] = b[idx]; 3219371c9d4SSatish Balay t[1] = b[1 + idx]; 3229371c9d4SSatish Balay t[2] = b[2 + idx]; 3239371c9d4SSatish Balay t[3] = b[3 + idx]; 3249371c9d4SSatish Balay t[4] = b[4 + idx]; 3259371c9d4SSatish Balay t[5] = b[5 + idx]; 32628e30874SSatish Balay for (i = 1; i < n; i++) { 32728e30874SSatish Balay v = aa + 36 * ai[i]; 32828e30874SSatish Balay vi = aj + ai[i]; 32928e30874SSatish Balay nz = diag[i] - ai[i]; 33028e30874SSatish Balay idx = 6 * (*r++); 3319371c9d4SSatish Balay s1 = b[idx]; 3329371c9d4SSatish Balay s2 = b[1 + idx]; 3339371c9d4SSatish Balay s3 = b[2 + idx]; 3349371c9d4SSatish Balay s4 = b[3 + idx]; 3359371c9d4SSatish Balay s5 = b[4 + idx]; 3369371c9d4SSatish Balay s6 = b[5 + idx]; 33728e30874SSatish Balay while (nz--) { 33828e30874SSatish Balay idx = 6 * (*vi++); 3399371c9d4SSatish Balay x1 = t[idx]; 3409371c9d4SSatish Balay x2 = t[1 + idx]; 3419371c9d4SSatish Balay x3 = t[2 + idx]; 3429371c9d4SSatish Balay x4 = t[3 + idx]; 3439371c9d4SSatish Balay x5 = t[4 + idx]; 3449371c9d4SSatish Balay x6 = t[5 + idx]; 34528e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 34628e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 34728e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 34828e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 34928e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 35028e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 35128e30874SSatish Balay v += 36; 35228e30874SSatish Balay } 35328e30874SSatish Balay idx = 6 * i; 3549371c9d4SSatish Balay t[idx] = s1; 3559371c9d4SSatish Balay t[1 + idx] = s2; 3569371c9d4SSatish Balay t[2 + idx] = s3; 3579371c9d4SSatish Balay t[3 + idx] = s4; 3589371c9d4SSatish Balay t[4 + idx] = s5; 3599371c9d4SSatish Balay t[5 + idx] = s6; 36028e30874SSatish Balay } 36128e30874SSatish Balay /* backward solve the upper triangular */ 36228e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 36328e30874SSatish Balay v = aa + 36 * diag[i] + 36; 36428e30874SSatish Balay vi = aj + diag[i] + 1; 36528e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 36628e30874SSatish Balay idt = 6 * i; 3679371c9d4SSatish Balay s1 = t[idt]; 3689371c9d4SSatish Balay s2 = t[1 + idt]; 3699371c9d4SSatish Balay s3 = t[2 + idt]; 3709371c9d4SSatish Balay s4 = t[3 + idt]; 3719371c9d4SSatish Balay s5 = t[4 + idt]; 3729371c9d4SSatish Balay s6 = t[5 + idt]; 37328e30874SSatish Balay while (nz--) { 37428e30874SSatish Balay idx = 6 * (*vi++); 3759371c9d4SSatish Balay x1 = t[idx]; 3769371c9d4SSatish Balay x2 = t[1 + idx]; 3779371c9d4SSatish Balay x3 = t[2 + idx]; 3789371c9d4SSatish Balay x4 = t[3 + idx]; 3799371c9d4SSatish Balay x5 = t[4 + idx]; 3809371c9d4SSatish Balay x6 = t[5 + idx]; 38128e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 38228e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 38328e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 38428e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 38528e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 38628e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 38728e30874SSatish Balay v += 36; 38828e30874SSatish Balay } 38928e30874SSatish Balay idc = 6 * (*c--); 39028e30874SSatish Balay v = aa + 36 * diag[i]; 3919371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 3929371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 3939371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 3949371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 3959371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 3969371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 39728e30874SSatish Balay } 39828e30874SSatish Balay 3999566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 4009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 4019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 4029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 4039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 40428e30874SSatish Balay PetscFunctionReturn(0); 40528e30874SSatish Balay } 40628e30874SSatish Balay 4079371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) { 40828e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 40928e30874SSatish Balay IS iscol = a->col, isrow = a->row; 41028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 41128e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 41228e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 41328e30874SSatish Balay const MatScalar *aa = a->a, *v; 41428e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 41528e30874SSatish Balay const PetscScalar *b; 41628e30874SSatish Balay 41728e30874SSatish Balay PetscFunctionBegin; 4189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 4199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42028e30874SSatish Balay t = a->solve_work; 42128e30874SSatish Balay 4229371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 4239371c9d4SSatish Balay r = rout; 4249371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 4259371c9d4SSatish Balay c = cout; 42628e30874SSatish Balay 42728e30874SSatish Balay /* forward solve the lower triangular */ 42828e30874SSatish Balay idx = 6 * r[0]; 4299371c9d4SSatish Balay t[0] = b[idx]; 4309371c9d4SSatish Balay t[1] = b[1 + idx]; 4319371c9d4SSatish Balay t[2] = b[2 + idx]; 4329371c9d4SSatish Balay t[3] = b[3 + idx]; 4339371c9d4SSatish Balay t[4] = b[4 + idx]; 4349371c9d4SSatish Balay t[5] = b[5 + idx]; 43528e30874SSatish Balay for (i = 1; i < n; i++) { 43628e30874SSatish Balay v = aa + 36 * ai[i]; 43728e30874SSatish Balay vi = aj + ai[i]; 43828e30874SSatish Balay nz = ai[i + 1] - ai[i]; 43928e30874SSatish Balay idx = 6 * r[i]; 4409371c9d4SSatish Balay s1 = b[idx]; 4419371c9d4SSatish Balay s2 = b[1 + idx]; 4429371c9d4SSatish Balay s3 = b[2 + idx]; 4439371c9d4SSatish Balay s4 = b[3 + idx]; 4449371c9d4SSatish Balay s5 = b[4 + idx]; 4459371c9d4SSatish Balay s6 = b[5 + idx]; 44628e30874SSatish Balay for (m = 0; m < nz; m++) { 44728e30874SSatish Balay idx = 6 * vi[m]; 4489371c9d4SSatish Balay x1 = t[idx]; 4499371c9d4SSatish Balay x2 = t[1 + idx]; 4509371c9d4SSatish Balay x3 = t[2 + idx]; 4519371c9d4SSatish Balay x4 = t[3 + idx]; 4529371c9d4SSatish Balay x5 = t[4 + idx]; 4539371c9d4SSatish Balay x6 = t[5 + idx]; 45428e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 45528e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 45628e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 45728e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 45828e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 45928e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 46028e30874SSatish Balay v += 36; 46128e30874SSatish Balay } 46228e30874SSatish Balay idx = 6 * i; 4639371c9d4SSatish Balay t[idx] = s1; 4649371c9d4SSatish Balay t[1 + idx] = s2; 4659371c9d4SSatish Balay t[2 + idx] = s3; 4669371c9d4SSatish Balay t[3 + idx] = s4; 4679371c9d4SSatish Balay t[4 + idx] = s5; 4689371c9d4SSatish Balay t[5 + idx] = s6; 46928e30874SSatish Balay } 47028e30874SSatish Balay /* backward solve the upper triangular */ 47128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 47228e30874SSatish Balay v = aa + 36 * (adiag[i + 1] + 1); 47328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 47428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 47528e30874SSatish Balay idt = 6 * i; 4769371c9d4SSatish Balay s1 = t[idt]; 4779371c9d4SSatish Balay s2 = t[1 + idt]; 4789371c9d4SSatish Balay s3 = t[2 + idt]; 4799371c9d4SSatish Balay s4 = t[3 + idt]; 4809371c9d4SSatish Balay s5 = t[4 + idt]; 4819371c9d4SSatish Balay s6 = t[5 + idt]; 48228e30874SSatish Balay for (m = 0; m < nz; m++) { 48328e30874SSatish Balay idx = 6 * vi[m]; 4849371c9d4SSatish Balay x1 = t[idx]; 4859371c9d4SSatish Balay x2 = t[1 + idx]; 4869371c9d4SSatish Balay x3 = t[2 + idx]; 4879371c9d4SSatish Balay x4 = t[3 + idx]; 4889371c9d4SSatish Balay x5 = t[4 + idx]; 4899371c9d4SSatish Balay x6 = t[5 + idx]; 49028e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 49128e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 49228e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 49328e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 49428e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 49528e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 49628e30874SSatish Balay v += 36; 49728e30874SSatish Balay } 49828e30874SSatish Balay idc = 6 * c[i]; 4999371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 5009371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 5019371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 5029371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 5039371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 5049371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 50528e30874SSatish Balay } 50628e30874SSatish Balay 5079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 5089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 5099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 5109566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 5119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 51228e30874SSatish Balay PetscFunctionReturn(0); 51328e30874SSatish Balay } 51428e30874SSatish Balay 5159371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) { 51628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 51728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 51828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 51928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 52028e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 52128e30874SSatish Balay const MatScalar *aa = a->a, *v; 52228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 52328e30874SSatish Balay const PetscScalar *b; 52428e30874SSatish Balay 52528e30874SSatish Balay PetscFunctionBegin; 5269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 5279566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 52828e30874SSatish Balay t = a->solve_work; 52928e30874SSatish Balay 5309371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 5319371c9d4SSatish Balay r = rout; 5329371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 5339371c9d4SSatish Balay c = cout + (n - 1); 53428e30874SSatish Balay 53528e30874SSatish Balay /* forward solve the lower triangular */ 53628e30874SSatish Balay idx = 5 * (*r++); 5379371c9d4SSatish Balay t[0] = b[idx]; 5389371c9d4SSatish Balay t[1] = b[1 + idx]; 5399371c9d4SSatish Balay t[2] = b[2 + idx]; 5409371c9d4SSatish Balay t[3] = b[3 + idx]; 5419371c9d4SSatish Balay t[4] = b[4 + idx]; 54228e30874SSatish Balay for (i = 1; i < n; i++) { 54328e30874SSatish Balay v = aa + 25 * ai[i]; 54428e30874SSatish Balay vi = aj + ai[i]; 54528e30874SSatish Balay nz = diag[i] - ai[i]; 54628e30874SSatish Balay idx = 5 * (*r++); 5479371c9d4SSatish Balay s1 = b[idx]; 5489371c9d4SSatish Balay s2 = b[1 + idx]; 5499371c9d4SSatish Balay s3 = b[2 + idx]; 5509371c9d4SSatish Balay s4 = b[3 + idx]; 55128e30874SSatish Balay s5 = b[4 + idx]; 55228e30874SSatish Balay while (nz--) { 55328e30874SSatish Balay idx = 5 * (*vi++); 5549371c9d4SSatish Balay x1 = t[idx]; 5559371c9d4SSatish Balay x2 = t[1 + idx]; 5569371c9d4SSatish Balay x3 = t[2 + idx]; 5579371c9d4SSatish Balay x4 = t[3 + idx]; 5589371c9d4SSatish Balay x5 = t[4 + idx]; 55928e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 56028e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 56128e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 56228e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 56328e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 56428e30874SSatish Balay v += 25; 56528e30874SSatish Balay } 56628e30874SSatish Balay idx = 5 * i; 5679371c9d4SSatish Balay t[idx] = s1; 5689371c9d4SSatish Balay t[1 + idx] = s2; 5699371c9d4SSatish Balay t[2 + idx] = s3; 5709371c9d4SSatish Balay t[3 + idx] = s4; 5719371c9d4SSatish Balay t[4 + idx] = s5; 57228e30874SSatish Balay } 57328e30874SSatish Balay /* backward solve the upper triangular */ 57428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 57528e30874SSatish Balay v = aa + 25 * diag[i] + 25; 57628e30874SSatish Balay vi = aj + diag[i] + 1; 57728e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 57828e30874SSatish Balay idt = 5 * i; 5799371c9d4SSatish Balay s1 = t[idt]; 5809371c9d4SSatish Balay s2 = t[1 + idt]; 5819371c9d4SSatish Balay s3 = t[2 + idt]; 5829371c9d4SSatish Balay s4 = t[3 + idt]; 5839371c9d4SSatish Balay s5 = t[4 + idt]; 58428e30874SSatish Balay while (nz--) { 58528e30874SSatish Balay idx = 5 * (*vi++); 5869371c9d4SSatish Balay x1 = t[idx]; 5879371c9d4SSatish Balay x2 = t[1 + idx]; 5889371c9d4SSatish Balay x3 = t[2 + idx]; 5899371c9d4SSatish Balay x4 = t[3 + idx]; 5909371c9d4SSatish Balay x5 = t[4 + idx]; 59128e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 59228e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 59328e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 59428e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 59528e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 59628e30874SSatish Balay v += 25; 59728e30874SSatish Balay } 59828e30874SSatish Balay idc = 5 * (*c--); 59928e30874SSatish Balay v = aa + 25 * diag[i]; 6009371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 6019371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 6029371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 6039371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 6049371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 60528e30874SSatish Balay } 60628e30874SSatish Balay 6079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 6089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 6099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 6109566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 6119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 61228e30874SSatish Balay PetscFunctionReturn(0); 61328e30874SSatish Balay } 61428e30874SSatish Balay 6159371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) { 61628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 61728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 61828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 61928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 62028e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 62128e30874SSatish Balay const MatScalar *aa = a->a, *v; 62228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 62328e30874SSatish Balay const PetscScalar *b; 62428e30874SSatish Balay 62528e30874SSatish Balay PetscFunctionBegin; 6269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 6279566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 62828e30874SSatish Balay t = a->solve_work; 62928e30874SSatish Balay 6309371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 6319371c9d4SSatish Balay r = rout; 6329371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 6339371c9d4SSatish Balay c = cout; 63428e30874SSatish Balay 63528e30874SSatish Balay /* forward solve the lower triangular */ 63628e30874SSatish Balay idx = 5 * r[0]; 6379371c9d4SSatish Balay t[0] = b[idx]; 6389371c9d4SSatish Balay t[1] = b[1 + idx]; 6399371c9d4SSatish Balay t[2] = b[2 + idx]; 6409371c9d4SSatish Balay t[3] = b[3 + idx]; 6419371c9d4SSatish Balay t[4] = b[4 + idx]; 64228e30874SSatish Balay for (i = 1; i < n; i++) { 64328e30874SSatish Balay v = aa + 25 * ai[i]; 64428e30874SSatish Balay vi = aj + ai[i]; 64528e30874SSatish Balay nz = ai[i + 1] - ai[i]; 64628e30874SSatish Balay idx = 5 * r[i]; 6479371c9d4SSatish Balay s1 = b[idx]; 6489371c9d4SSatish Balay s2 = b[1 + idx]; 6499371c9d4SSatish Balay s3 = b[2 + idx]; 6509371c9d4SSatish Balay s4 = b[3 + idx]; 65128e30874SSatish Balay s5 = b[4 + idx]; 65228e30874SSatish Balay for (m = 0; m < nz; m++) { 65328e30874SSatish Balay idx = 5 * vi[m]; 6549371c9d4SSatish Balay x1 = t[idx]; 6559371c9d4SSatish Balay x2 = t[1 + idx]; 6569371c9d4SSatish Balay x3 = t[2 + idx]; 6579371c9d4SSatish Balay x4 = t[3 + idx]; 6589371c9d4SSatish Balay x5 = t[4 + idx]; 65928e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 66028e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 66128e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 66228e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 66328e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 66428e30874SSatish Balay v += 25; 66528e30874SSatish Balay } 66628e30874SSatish Balay idx = 5 * i; 6679371c9d4SSatish Balay t[idx] = s1; 6689371c9d4SSatish Balay t[1 + idx] = s2; 6699371c9d4SSatish Balay t[2 + idx] = s3; 6709371c9d4SSatish Balay t[3 + idx] = s4; 6719371c9d4SSatish Balay t[4 + idx] = s5; 67228e30874SSatish Balay } 67328e30874SSatish Balay /* backward solve the upper triangular */ 67428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 67528e30874SSatish Balay v = aa + 25 * (adiag[i + 1] + 1); 67628e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 67728e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 67828e30874SSatish Balay idt = 5 * i; 6799371c9d4SSatish Balay s1 = t[idt]; 6809371c9d4SSatish Balay s2 = t[1 + idt]; 6819371c9d4SSatish Balay s3 = t[2 + idt]; 6829371c9d4SSatish Balay s4 = t[3 + idt]; 6839371c9d4SSatish Balay s5 = t[4 + idt]; 68428e30874SSatish Balay for (m = 0; m < nz; m++) { 68528e30874SSatish Balay idx = 5 * vi[m]; 6869371c9d4SSatish Balay x1 = t[idx]; 6879371c9d4SSatish Balay x2 = t[1 + idx]; 6889371c9d4SSatish Balay x3 = t[2 + idx]; 6899371c9d4SSatish Balay x4 = t[3 + idx]; 6909371c9d4SSatish Balay x5 = t[4 + idx]; 69128e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 69228e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 69328e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 69428e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 69528e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 69628e30874SSatish Balay v += 25; 69728e30874SSatish Balay } 69828e30874SSatish Balay idc = 5 * c[i]; 6999371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 7009371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 7019371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 7029371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 7039371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 70428e30874SSatish Balay } 70528e30874SSatish Balay 7069566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 7099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 7109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 71128e30874SSatish Balay PetscFunctionReturn(0); 71228e30874SSatish Balay } 71328e30874SSatish Balay 7149371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) { 71528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 71628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 71728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 71828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 71928e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 72028e30874SSatish Balay const MatScalar *aa = a->a, *v; 72128e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 72228e30874SSatish Balay const PetscScalar *b; 72328e30874SSatish Balay 72428e30874SSatish Balay PetscFunctionBegin; 7259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7269566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 72728e30874SSatish Balay t = a->solve_work; 72828e30874SSatish Balay 7299371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7309371c9d4SSatish Balay r = rout; 7319371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7329371c9d4SSatish Balay c = cout + (n - 1); 73328e30874SSatish Balay 73428e30874SSatish Balay /* forward solve the lower triangular */ 73528e30874SSatish Balay idx = 4 * (*r++); 7369371c9d4SSatish Balay t[0] = b[idx]; 7379371c9d4SSatish Balay t[1] = b[1 + idx]; 7389371c9d4SSatish Balay t[2] = b[2 + idx]; 7399371c9d4SSatish Balay t[3] = b[3 + idx]; 74028e30874SSatish Balay for (i = 1; i < n; i++) { 74128e30874SSatish Balay v = aa + 16 * ai[i]; 74228e30874SSatish Balay vi = aj + ai[i]; 74328e30874SSatish Balay nz = diag[i] - ai[i]; 74428e30874SSatish Balay idx = 4 * (*r++); 7459371c9d4SSatish Balay s1 = b[idx]; 7469371c9d4SSatish Balay s2 = b[1 + idx]; 7479371c9d4SSatish Balay s3 = b[2 + idx]; 7489371c9d4SSatish Balay s4 = b[3 + idx]; 74928e30874SSatish Balay while (nz--) { 75028e30874SSatish Balay idx = 4 * (*vi++); 7519371c9d4SSatish Balay x1 = t[idx]; 7529371c9d4SSatish Balay x2 = t[1 + idx]; 7539371c9d4SSatish Balay x3 = t[2 + idx]; 7549371c9d4SSatish Balay x4 = t[3 + idx]; 75528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 75628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 75728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 75828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 75928e30874SSatish Balay v += 16; 76028e30874SSatish Balay } 76128e30874SSatish Balay idx = 4 * i; 7629371c9d4SSatish Balay t[idx] = s1; 7639371c9d4SSatish Balay t[1 + idx] = s2; 7649371c9d4SSatish Balay t[2 + idx] = s3; 7659371c9d4SSatish Balay t[3 + idx] = s4; 76628e30874SSatish Balay } 76728e30874SSatish Balay /* backward solve the upper triangular */ 76828e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 76928e30874SSatish Balay v = aa + 16 * diag[i] + 16; 77028e30874SSatish Balay vi = aj + diag[i] + 1; 77128e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 77228e30874SSatish Balay idt = 4 * i; 7739371c9d4SSatish Balay s1 = t[idt]; 7749371c9d4SSatish Balay s2 = t[1 + idt]; 7759371c9d4SSatish Balay s3 = t[2 + idt]; 7769371c9d4SSatish Balay s4 = t[3 + idt]; 77728e30874SSatish Balay while (nz--) { 77828e30874SSatish Balay idx = 4 * (*vi++); 7799371c9d4SSatish Balay x1 = t[idx]; 7809371c9d4SSatish Balay x2 = t[1 + idx]; 7819371c9d4SSatish Balay x3 = t[2 + idx]; 7829371c9d4SSatish Balay x4 = t[3 + idx]; 78328e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 78428e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 78528e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 78628e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 78728e30874SSatish Balay v += 16; 78828e30874SSatish Balay } 78928e30874SSatish Balay idc = 4 * (*c--); 79028e30874SSatish Balay v = aa + 16 * diag[i]; 79128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 79228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 79328e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 79428e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 79528e30874SSatish Balay } 79628e30874SSatish Balay 7979566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7999566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8009566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 80228e30874SSatish Balay PetscFunctionReturn(0); 80328e30874SSatish Balay } 80428e30874SSatish Balay 8059371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) { 80628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 80728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 80828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 80928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 81028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 81128e30874SSatish Balay const MatScalar *aa = a->a, *v; 81228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 81328e30874SSatish Balay const PetscScalar *b; 81428e30874SSatish Balay 81528e30874SSatish Balay PetscFunctionBegin; 8169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 8179566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 81828e30874SSatish Balay t = a->solve_work; 81928e30874SSatish Balay 8209371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 8219371c9d4SSatish Balay r = rout; 8229371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 8239371c9d4SSatish Balay c = cout; 82428e30874SSatish Balay 82528e30874SSatish Balay /* forward solve the lower triangular */ 82628e30874SSatish Balay idx = 4 * r[0]; 8279371c9d4SSatish Balay t[0] = b[idx]; 8289371c9d4SSatish Balay t[1] = b[1 + idx]; 8299371c9d4SSatish Balay t[2] = b[2 + idx]; 8309371c9d4SSatish Balay t[3] = b[3 + idx]; 83128e30874SSatish Balay for (i = 1; i < n; i++) { 83228e30874SSatish Balay v = aa + 16 * ai[i]; 83328e30874SSatish Balay vi = aj + ai[i]; 83428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 83528e30874SSatish Balay idx = 4 * r[i]; 8369371c9d4SSatish Balay s1 = b[idx]; 8379371c9d4SSatish Balay s2 = b[1 + idx]; 8389371c9d4SSatish Balay s3 = b[2 + idx]; 8399371c9d4SSatish Balay s4 = b[3 + idx]; 84028e30874SSatish Balay for (m = 0; m < nz; m++) { 84128e30874SSatish Balay idx = 4 * vi[m]; 8429371c9d4SSatish Balay x1 = t[idx]; 8439371c9d4SSatish Balay x2 = t[1 + idx]; 8449371c9d4SSatish Balay x3 = t[2 + idx]; 8459371c9d4SSatish Balay x4 = t[3 + idx]; 84628e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 84728e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 84828e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 84928e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 85028e30874SSatish Balay v += 16; 85128e30874SSatish Balay } 85228e30874SSatish Balay idx = 4 * i; 8539371c9d4SSatish Balay t[idx] = s1; 8549371c9d4SSatish Balay t[1 + idx] = s2; 8559371c9d4SSatish Balay t[2 + idx] = s3; 8569371c9d4SSatish Balay t[3 + idx] = s4; 85728e30874SSatish Balay } 85828e30874SSatish Balay /* backward solve the upper triangular */ 85928e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 86028e30874SSatish Balay v = aa + 16 * (adiag[i + 1] + 1); 86128e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 86228e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 86328e30874SSatish Balay idt = 4 * i; 8649371c9d4SSatish Balay s1 = t[idt]; 8659371c9d4SSatish Balay s2 = t[1 + idt]; 8669371c9d4SSatish Balay s3 = t[2 + idt]; 8679371c9d4SSatish Balay s4 = t[3 + idt]; 86828e30874SSatish Balay for (m = 0; m < nz; m++) { 86928e30874SSatish Balay idx = 4 * vi[m]; 8709371c9d4SSatish Balay x1 = t[idx]; 8719371c9d4SSatish Balay x2 = t[1 + idx]; 8729371c9d4SSatish Balay x3 = t[2 + idx]; 8739371c9d4SSatish Balay x4 = t[3 + idx]; 87428e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 87528e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 87628e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 87728e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 87828e30874SSatish Balay v += 16; 87928e30874SSatish Balay } 88028e30874SSatish Balay idc = 4 * c[i]; 88128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 88228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 88328e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 88428e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 88528e30874SSatish Balay } 88628e30874SSatish Balay 8879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8909566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 89228e30874SSatish Balay PetscFunctionReturn(0); 89328e30874SSatish Balay } 89428e30874SSatish Balay 8959371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) { 89628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 89728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 89828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 89928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 90028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 90128e30874SSatish Balay const MatScalar *aa = a->a, *v; 90228e30874SSatish Balay MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 90328e30874SSatish Balay PetscScalar *x; 90428e30874SSatish Balay const PetscScalar *b; 90528e30874SSatish Balay 90628e30874SSatish Balay PetscFunctionBegin; 9079566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 9089566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 90928e30874SSatish Balay t = (MatScalar *)a->solve_work; 91028e30874SSatish Balay 9119371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 9129371c9d4SSatish Balay r = rout; 9139371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 9149371c9d4SSatish Balay c = cout + (n - 1); 91528e30874SSatish Balay 91628e30874SSatish Balay /* forward solve the lower triangular */ 91728e30874SSatish Balay idx = 4 * (*r++); 91828e30874SSatish Balay t[0] = (MatScalar)b[idx]; 91928e30874SSatish Balay t[1] = (MatScalar)b[1 + idx]; 92028e30874SSatish Balay t[2] = (MatScalar)b[2 + idx]; 92128e30874SSatish Balay t[3] = (MatScalar)b[3 + idx]; 92228e30874SSatish Balay for (i = 1; i < n; i++) { 92328e30874SSatish Balay v = aa + 16 * ai[i]; 92428e30874SSatish Balay vi = aj + ai[i]; 92528e30874SSatish Balay nz = diag[i] - ai[i]; 92628e30874SSatish Balay idx = 4 * (*r++); 92728e30874SSatish Balay s1 = (MatScalar)b[idx]; 92828e30874SSatish Balay s2 = (MatScalar)b[1 + idx]; 92928e30874SSatish Balay s3 = (MatScalar)b[2 + idx]; 93028e30874SSatish Balay s4 = (MatScalar)b[3 + idx]; 93128e30874SSatish Balay while (nz--) { 93228e30874SSatish Balay idx = 4 * (*vi++); 93328e30874SSatish Balay x1 = t[idx]; 93428e30874SSatish Balay x2 = t[1 + idx]; 93528e30874SSatish Balay x3 = t[2 + idx]; 93628e30874SSatish Balay x4 = t[3 + idx]; 93728e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 93828e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 93928e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 94028e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 94128e30874SSatish Balay v += 16; 94228e30874SSatish Balay } 94328e30874SSatish Balay idx = 4 * i; 94428e30874SSatish Balay t[idx] = s1; 94528e30874SSatish Balay t[1 + idx] = s2; 94628e30874SSatish Balay t[2 + idx] = s3; 94728e30874SSatish Balay t[3 + idx] = s4; 94828e30874SSatish Balay } 94928e30874SSatish Balay /* backward solve the upper triangular */ 95028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 95128e30874SSatish Balay v = aa + 16 * diag[i] + 16; 95228e30874SSatish Balay vi = aj + diag[i] + 1; 95328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 95428e30874SSatish Balay idt = 4 * i; 95528e30874SSatish Balay s1 = t[idt]; 95628e30874SSatish Balay s2 = t[1 + idt]; 95728e30874SSatish Balay s3 = t[2 + idt]; 95828e30874SSatish Balay s4 = t[3 + idt]; 95928e30874SSatish Balay while (nz--) { 96028e30874SSatish Balay idx = 4 * (*vi++); 96128e30874SSatish Balay x1 = t[idx]; 96228e30874SSatish Balay x2 = t[1 + idx]; 96328e30874SSatish Balay x3 = t[2 + idx]; 96428e30874SSatish Balay x4 = t[3 + idx]; 96528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 96628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 96728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 96828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 96928e30874SSatish Balay v += 16; 97028e30874SSatish Balay } 97128e30874SSatish Balay idc = 4 * (*c--); 97228e30874SSatish Balay v = aa + 16 * diag[i]; 97328e30874SSatish Balay t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 97428e30874SSatish Balay t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 97528e30874SSatish Balay t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 97628e30874SSatish Balay t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 97728e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 97828e30874SSatish Balay x[1 + idc] = (PetscScalar)t[1 + idt]; 97928e30874SSatish Balay x[2 + idc] = (PetscScalar)t[2 + idt]; 98028e30874SSatish Balay x[3 + idc] = (PetscScalar)t[3 + idt]; 98128e30874SSatish Balay } 98228e30874SSatish Balay 9839566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 9849566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 9859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 9869566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 98828e30874SSatish Balay PetscFunctionReturn(0); 98928e30874SSatish Balay } 99028e30874SSatish Balay 99128e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 99228e30874SSatish Balay 99328e30874SSatish Balay #include PETSC_HAVE_SSE 99428e30874SSatish Balay 9959371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) { 99628e30874SSatish Balay /* 99728e30874SSatish Balay Note: This code uses demotion of double 99828e30874SSatish Balay to float when performing the mixed-mode computation. 99928e30874SSatish Balay This may not be numerically reasonable for all applications. 100028e30874SSatish Balay */ 100128e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 100228e30874SSatish Balay IS iscol = a->col, isrow = a->row; 100328e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 100428e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 100528e30874SSatish Balay MatScalar *aa = a->a, *v; 100628e30874SSatish Balay PetscScalar *x, *b, *t; 100728e30874SSatish Balay 100828e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 100928e30874SSatish Balay float ssealignedspace[11], *tmps, *tmpx; 101028e30874SSatish Balay unsigned long offset; 101128e30874SSatish Balay 101228e30874SSatish Balay PetscFunctionBegin; 101328e30874SSatish Balay SSE_SCOPE_BEGIN; 101428e30874SSatish Balay 101528e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 101628e30874SSatish Balay if (offset) offset = (16 - offset) / 4; 101728e30874SSatish Balay tmps = &ssealignedspace[offset]; 101828e30874SSatish Balay tmpx = &ssealignedspace[offset + 4]; 101928e30874SSatish Balay PREFETCH_NTA(aa + 16 * ai[1]); 102028e30874SSatish Balay 10219566063dSJacob Faibussowitsch PetscCall(VecGetArray(bb, &b)); 10229566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 102328e30874SSatish Balay t = a->solve_work; 102428e30874SSatish Balay 10259371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 10269371c9d4SSatish Balay r = rout; 10279371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 10289371c9d4SSatish Balay c = cout + (n - 1); 102928e30874SSatish Balay 103028e30874SSatish Balay /* forward solve the lower triangular */ 103128e30874SSatish Balay idx = 4 * (*r++); 10329371c9d4SSatish Balay t[0] = b[idx]; 10339371c9d4SSatish Balay t[1] = b[1 + idx]; 10349371c9d4SSatish Balay t[2] = b[2 + idx]; 10359371c9d4SSatish Balay t[3] = b[3 + idx]; 103628e30874SSatish Balay v = aa + 16 * ai[1]; 103728e30874SSatish Balay 103828e30874SSatish Balay for (i = 1; i < n;) { 103928e30874SSatish Balay PREFETCH_NTA(&v[8]); 104028e30874SSatish Balay vi = aj + ai[i]; 104128e30874SSatish Balay nz = diag[i] - ai[i]; 104228e30874SSatish Balay idx = 4 * (*r++); 104328e30874SSatish Balay 104428e30874SSatish Balay /* Demote sum from double to float */ 104528e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 104628e30874SSatish Balay LOAD_PS(tmps, XMM7); 104728e30874SSatish Balay 104828e30874SSatish Balay while (nz--) { 104928e30874SSatish Balay PREFETCH_NTA(&v[16]); 105028e30874SSatish Balay idx = 4 * (*vi++); 105128e30874SSatish Balay 105228e30874SSatish Balay /* Demote solution (so far) from double to float */ 105328e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 105428e30874SSatish Balay 105528e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 105628e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 105728e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 105828e30874SSatish Balay 105928e30874SSatish Balay /* First Column */ 106028e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 106128e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 106228e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 106328e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 106428e30874SSatish Balay 106528e30874SSatish Balay /* Second Column */ 106628e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 106728e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 106828e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 106928e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 107028e30874SSatish Balay 107128e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 107228e30874SSatish Balay 107328e30874SSatish Balay /* Third Column */ 107428e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 107528e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 107628e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 107728e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 107828e30874SSatish Balay 107928e30874SSatish Balay /* Fourth Column */ 108028e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 108128e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 108228e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 108328e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 108428e30874SSatish Balay SSE_INLINE_END_2 108528e30874SSatish Balay 108628e30874SSatish Balay v += 16; 108728e30874SSatish Balay } 108828e30874SSatish Balay idx = 4 * i; 108928e30874SSatish Balay v = aa + 16 * ai[++i]; 109028e30874SSatish Balay PREFETCH_NTA(v); 109128e30874SSatish Balay STORE_PS(tmps, XMM7); 109228e30874SSatish Balay 109328e30874SSatish Balay /* Promote result from float to double */ 109428e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 109528e30874SSatish Balay } 109628e30874SSatish Balay /* backward solve the upper triangular */ 109728e30874SSatish Balay idt = 4 * (n - 1); 109828e30874SSatish Balay ai16 = 16 * diag[n - 1]; 109928e30874SSatish Balay v = aa + ai16 + 16; 110028e30874SSatish Balay for (i = n - 1; i >= 0;) { 110128e30874SSatish Balay PREFETCH_NTA(&v[8]); 110228e30874SSatish Balay vi = aj + diag[i] + 1; 110328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 110428e30874SSatish Balay 110528e30874SSatish Balay /* Demote accumulator from double to float */ 110628e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 110728e30874SSatish Balay LOAD_PS(tmps, XMM7); 110828e30874SSatish Balay 110928e30874SSatish Balay while (nz--) { 111028e30874SSatish Balay PREFETCH_NTA(&v[16]); 111128e30874SSatish Balay idx = 4 * (*vi++); 111228e30874SSatish Balay 111328e30874SSatish Balay /* Demote solution (so far) from double to float */ 111428e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 111528e30874SSatish Balay 111628e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 111728e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 111828e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 111928e30874SSatish Balay 112028e30874SSatish Balay /* First Column */ 112128e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 112228e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 112328e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 112428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 112528e30874SSatish Balay 112628e30874SSatish Balay /* Second Column */ 112728e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 112828e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 112928e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 113028e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 113128e30874SSatish Balay 113228e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 113328e30874SSatish Balay 113428e30874SSatish Balay /* Third Column */ 113528e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 113628e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 113728e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 113828e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 113928e30874SSatish Balay 114028e30874SSatish Balay /* Fourth Column */ 114128e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 114228e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 114328e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 114428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 114528e30874SSatish Balay SSE_INLINE_END_2 114628e30874SSatish Balay v += 16; 114728e30874SSatish Balay } 114828e30874SSatish Balay v = aa + ai16; 114928e30874SSatish Balay ai16 = 16 * diag[--i]; 115028e30874SSatish Balay PREFETCH_NTA(aa + ai16 + 16); 115128e30874SSatish Balay /* 115228e30874SSatish Balay Scale the result by the diagonal 4x4 block, 115328e30874SSatish Balay which was inverted as part of the factorization 115428e30874SSatish Balay */ 115528e30874SSatish Balay SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 115628e30874SSatish Balay /* First Column */ 115728e30874SSatish Balay SSE_COPY_PS(XMM0, XMM7) 115828e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 115928e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 116028e30874SSatish Balay 116128e30874SSatish Balay /* Second Column */ 116228e30874SSatish Balay SSE_COPY_PS(XMM1, XMM7) 116328e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 116428e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 116528e30874SSatish Balay SSE_ADD_PS(XMM0, XMM1) 116628e30874SSatish Balay 116728e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 116828e30874SSatish Balay 116928e30874SSatish Balay /* Third Column */ 117028e30874SSatish Balay SSE_COPY_PS(XMM2, XMM7) 117128e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 117228e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 117328e30874SSatish Balay SSE_ADD_PS(XMM0, XMM2) 117428e30874SSatish Balay 117528e30874SSatish Balay /* Fourth Column */ 117628e30874SSatish Balay SSE_COPY_PS(XMM3, XMM7) 117728e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 117828e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 117928e30874SSatish Balay SSE_ADD_PS(XMM0, XMM3) 118028e30874SSatish Balay 118128e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 118228e30874SSatish Balay SSE_INLINE_END_3 118328e30874SSatish Balay 118428e30874SSatish Balay /* Promote solution from float to double */ 118528e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 118628e30874SSatish Balay 118728e30874SSatish Balay /* Apply reordering to t and stream into x. */ 118828e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 118928e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 119028e30874SSatish Balay idc = 4 * (*c--); 119128e30874SSatish Balay SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 119228e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 119328e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 119428e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 119528e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 119628e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 119728e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 119828e30874SSatish Balay SSE_INLINE_END_2 119928e30874SSatish Balay v = aa + ai16 + 16; 120028e30874SSatish Balay idt -= 4; 120128e30874SSatish Balay } 120228e30874SSatish Balay 12039566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12049566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12059566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(bb, &b)); 12069566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 120828e30874SSatish Balay SSE_SCOPE_END; 120928e30874SSatish Balay PetscFunctionReturn(0); 121028e30874SSatish Balay } 121128e30874SSatish Balay 121228e30874SSatish Balay #endif 121328e30874SSatish Balay 12149371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) { 121528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 121628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 121728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 121828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 121928e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 122028e30874SSatish Balay const MatScalar *aa = a->a, *v; 122128e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 122228e30874SSatish Balay const PetscScalar *b; 122328e30874SSatish Balay 122428e30874SSatish Balay PetscFunctionBegin; 12259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 12269566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 122728e30874SSatish Balay t = a->solve_work; 122828e30874SSatish Balay 12299371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 12309371c9d4SSatish Balay r = rout; 12319371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 12329371c9d4SSatish Balay c = cout + (n - 1); 123328e30874SSatish Balay 123428e30874SSatish Balay /* forward solve the lower triangular */ 123528e30874SSatish Balay idx = 3 * (*r++); 12369371c9d4SSatish Balay t[0] = b[idx]; 12379371c9d4SSatish Balay t[1] = b[1 + idx]; 12389371c9d4SSatish Balay t[2] = b[2 + idx]; 123928e30874SSatish Balay for (i = 1; i < n; i++) { 124028e30874SSatish Balay v = aa + 9 * ai[i]; 124128e30874SSatish Balay vi = aj + ai[i]; 124228e30874SSatish Balay nz = diag[i] - ai[i]; 124328e30874SSatish Balay idx = 3 * (*r++); 12449371c9d4SSatish Balay s1 = b[idx]; 12459371c9d4SSatish Balay s2 = b[1 + idx]; 12469371c9d4SSatish Balay s3 = b[2 + idx]; 124728e30874SSatish Balay while (nz--) { 124828e30874SSatish Balay idx = 3 * (*vi++); 12499371c9d4SSatish Balay x1 = t[idx]; 12509371c9d4SSatish Balay x2 = t[1 + idx]; 12519371c9d4SSatish Balay x3 = t[2 + idx]; 125228e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 125328e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 125428e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 125528e30874SSatish Balay v += 9; 125628e30874SSatish Balay } 125728e30874SSatish Balay idx = 3 * i; 12589371c9d4SSatish Balay t[idx] = s1; 12599371c9d4SSatish Balay t[1 + idx] = s2; 12609371c9d4SSatish Balay t[2 + idx] = s3; 126128e30874SSatish Balay } 126228e30874SSatish Balay /* backward solve the upper triangular */ 126328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 126428e30874SSatish Balay v = aa + 9 * diag[i] + 9; 126528e30874SSatish Balay vi = aj + diag[i] + 1; 126628e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 126728e30874SSatish Balay idt = 3 * i; 12689371c9d4SSatish Balay s1 = t[idt]; 12699371c9d4SSatish Balay s2 = t[1 + idt]; 12709371c9d4SSatish Balay s3 = t[2 + idt]; 127128e30874SSatish Balay while (nz--) { 127228e30874SSatish Balay idx = 3 * (*vi++); 12739371c9d4SSatish Balay x1 = t[idx]; 12749371c9d4SSatish Balay x2 = t[1 + idx]; 12759371c9d4SSatish Balay x3 = t[2 + idx]; 127628e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 127728e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 127828e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 127928e30874SSatish Balay v += 9; 128028e30874SSatish Balay } 128128e30874SSatish Balay idc = 3 * (*c--); 128228e30874SSatish Balay v = aa + 9 * diag[i]; 128328e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 128428e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 128528e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 128628e30874SSatish Balay } 12879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 12909566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 129228e30874SSatish Balay PetscFunctionReturn(0); 129328e30874SSatish Balay } 129428e30874SSatish Balay 12959371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) { 129628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 129728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 129828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 129928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 130028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 130128e30874SSatish Balay const MatScalar *aa = a->a, *v; 130228e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 130328e30874SSatish Balay const PetscScalar *b; 130428e30874SSatish Balay 130528e30874SSatish Balay PetscFunctionBegin; 13069566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13079566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 130828e30874SSatish Balay t = a->solve_work; 130928e30874SSatish Balay 13109371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 13119371c9d4SSatish Balay r = rout; 13129371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 13139371c9d4SSatish Balay c = cout; 131428e30874SSatish Balay 131528e30874SSatish Balay /* forward solve the lower triangular */ 131628e30874SSatish Balay idx = 3 * r[0]; 13179371c9d4SSatish Balay t[0] = b[idx]; 13189371c9d4SSatish Balay t[1] = b[1 + idx]; 13199371c9d4SSatish Balay t[2] = b[2 + idx]; 132028e30874SSatish Balay for (i = 1; i < n; i++) { 132128e30874SSatish Balay v = aa + 9 * ai[i]; 132228e30874SSatish Balay vi = aj + ai[i]; 132328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 132428e30874SSatish Balay idx = 3 * r[i]; 13259371c9d4SSatish Balay s1 = b[idx]; 13269371c9d4SSatish Balay s2 = b[1 + idx]; 13279371c9d4SSatish Balay s3 = b[2 + idx]; 132828e30874SSatish Balay for (m = 0; m < nz; m++) { 132928e30874SSatish Balay idx = 3 * vi[m]; 13309371c9d4SSatish Balay x1 = t[idx]; 13319371c9d4SSatish Balay x2 = t[1 + idx]; 13329371c9d4SSatish Balay x3 = t[2 + idx]; 133328e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 133428e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 133528e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 133628e30874SSatish Balay v += 9; 133728e30874SSatish Balay } 133828e30874SSatish Balay idx = 3 * i; 13399371c9d4SSatish Balay t[idx] = s1; 13409371c9d4SSatish Balay t[1 + idx] = s2; 13419371c9d4SSatish Balay t[2 + idx] = s3; 134228e30874SSatish Balay } 134328e30874SSatish Balay /* backward solve the upper triangular */ 134428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 134528e30874SSatish Balay v = aa + 9 * (adiag[i + 1] + 1); 134628e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 134728e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 134828e30874SSatish Balay idt = 3 * i; 13499371c9d4SSatish Balay s1 = t[idt]; 13509371c9d4SSatish Balay s2 = t[1 + idt]; 13519371c9d4SSatish Balay s3 = t[2 + idt]; 135228e30874SSatish Balay for (m = 0; m < nz; m++) { 135328e30874SSatish Balay idx = 3 * vi[m]; 13549371c9d4SSatish Balay x1 = t[idx]; 13559371c9d4SSatish Balay x2 = t[1 + idx]; 13569371c9d4SSatish Balay x3 = t[2 + idx]; 135728e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 135828e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 135928e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 136028e30874SSatish Balay v += 9; 136128e30874SSatish Balay } 136228e30874SSatish Balay idc = 3 * c[i]; 136328e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 136428e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 136528e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 136628e30874SSatish Balay } 13679566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13689566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13709566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 137228e30874SSatish Balay PetscFunctionReturn(0); 137328e30874SSatish Balay } 137428e30874SSatish Balay 13759371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) { 137628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 137728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 137828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 137928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 138028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 138128e30874SSatish Balay const MatScalar *aa = a->a, *v; 138228e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 138328e30874SSatish Balay const PetscScalar *b; 138428e30874SSatish Balay 138528e30874SSatish Balay PetscFunctionBegin; 13869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13879566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 138828e30874SSatish Balay t = a->solve_work; 138928e30874SSatish Balay 13909371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 13919371c9d4SSatish Balay r = rout; 13929371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 13939371c9d4SSatish Balay c = cout + (n - 1); 139428e30874SSatish Balay 139528e30874SSatish Balay /* forward solve the lower triangular */ 139628e30874SSatish Balay idx = 2 * (*r++); 13979371c9d4SSatish Balay t[0] = b[idx]; 13989371c9d4SSatish Balay t[1] = b[1 + idx]; 139928e30874SSatish Balay for (i = 1; i < n; i++) { 140028e30874SSatish Balay v = aa + 4 * ai[i]; 140128e30874SSatish Balay vi = aj + ai[i]; 140228e30874SSatish Balay nz = diag[i] - ai[i]; 140328e30874SSatish Balay idx = 2 * (*r++); 14049371c9d4SSatish Balay s1 = b[idx]; 14059371c9d4SSatish Balay s2 = b[1 + idx]; 140628e30874SSatish Balay while (nz--) { 140728e30874SSatish Balay idx = 2 * (*vi++); 14089371c9d4SSatish Balay x1 = t[idx]; 14099371c9d4SSatish Balay x2 = t[1 + idx]; 141028e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 141128e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 141228e30874SSatish Balay v += 4; 141328e30874SSatish Balay } 141428e30874SSatish Balay idx = 2 * i; 14159371c9d4SSatish Balay t[idx] = s1; 14169371c9d4SSatish Balay t[1 + idx] = s2; 141728e30874SSatish Balay } 141828e30874SSatish Balay /* backward solve the upper triangular */ 141928e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 142028e30874SSatish Balay v = aa + 4 * diag[i] + 4; 142128e30874SSatish Balay vi = aj + diag[i] + 1; 142228e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 142328e30874SSatish Balay idt = 2 * i; 14249371c9d4SSatish Balay s1 = t[idt]; 14259371c9d4SSatish Balay s2 = t[1 + idt]; 142628e30874SSatish Balay while (nz--) { 142728e30874SSatish Balay idx = 2 * (*vi++); 14289371c9d4SSatish Balay x1 = t[idx]; 14299371c9d4SSatish Balay x2 = t[1 + idx]; 143028e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 143128e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 143228e30874SSatish Balay v += 4; 143328e30874SSatish Balay } 143428e30874SSatish Balay idc = 2 * (*c--); 143528e30874SSatish Balay v = aa + 4 * diag[i]; 143628e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 143728e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 143828e30874SSatish Balay } 14399566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 14409566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 14419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 14429566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 14439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 144428e30874SSatish Balay PetscFunctionReturn(0); 144528e30874SSatish Balay } 144628e30874SSatish Balay 14479371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) { 144828e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 144928e30874SSatish Balay IS iscol = a->col, isrow = a->row; 145028e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 145128e30874SSatish Balay PetscInt i, nz, idx, jdx, idt, idc, m; 145228e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 145328e30874SSatish Balay const MatScalar *aa = a->a, *v; 145428e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 145528e30874SSatish Balay const PetscScalar *b; 145628e30874SSatish Balay 145728e30874SSatish Balay PetscFunctionBegin; 14589566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14599566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 146028e30874SSatish Balay t = a->solve_work; 146128e30874SSatish Balay 14629371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14639371c9d4SSatish Balay r = rout; 14649371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14659371c9d4SSatish Balay c = cout; 146628e30874SSatish Balay 146728e30874SSatish Balay /* forward solve the lower triangular */ 146828e30874SSatish Balay idx = 2 * r[0]; 14699371c9d4SSatish Balay t[0] = b[idx]; 14709371c9d4SSatish Balay t[1] = b[1 + idx]; 147128e30874SSatish Balay for (i = 1; i < n; i++) { 147228e30874SSatish Balay v = aa + 4 * ai[i]; 147328e30874SSatish Balay vi = aj + ai[i]; 147428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 147528e30874SSatish Balay idx = 2 * r[i]; 14769371c9d4SSatish Balay s1 = b[idx]; 14779371c9d4SSatish Balay s2 = b[1 + idx]; 147828e30874SSatish Balay for (m = 0; m < nz; m++) { 147928e30874SSatish Balay jdx = 2 * vi[m]; 14809371c9d4SSatish Balay x1 = t[jdx]; 14819371c9d4SSatish Balay x2 = t[1 + jdx]; 148228e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 148328e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 148428e30874SSatish Balay v += 4; 148528e30874SSatish Balay } 148628e30874SSatish Balay idx = 2 * i; 14879371c9d4SSatish Balay t[idx] = s1; 14889371c9d4SSatish Balay t[1 + idx] = s2; 148928e30874SSatish Balay } 149028e30874SSatish Balay /* backward solve the upper triangular */ 149128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 149228e30874SSatish Balay v = aa + 4 * (adiag[i + 1] + 1); 149328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 149428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 149528e30874SSatish Balay idt = 2 * i; 14969371c9d4SSatish Balay s1 = t[idt]; 14979371c9d4SSatish Balay s2 = t[1 + idt]; 149828e30874SSatish Balay for (m = 0; m < nz; m++) { 149928e30874SSatish Balay idx = 2 * vi[m]; 15009371c9d4SSatish Balay x1 = t[idx]; 15019371c9d4SSatish Balay x2 = t[1 + idx]; 150228e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 150328e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 150428e30874SSatish Balay v += 4; 150528e30874SSatish Balay } 150628e30874SSatish Balay idc = 2 * c[i]; 150728e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 150828e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 150928e30874SSatish Balay } 15109566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15119566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15139566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 151528e30874SSatish Balay PetscFunctionReturn(0); 151628e30874SSatish Balay } 151728e30874SSatish Balay 15189371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 151928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 152028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 152128e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 152228e30874SSatish Balay PetscInt i, nz; 152328e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 152428e30874SSatish Balay const MatScalar *aa = a->a, *v; 152528e30874SSatish Balay PetscScalar *x, s1, *t; 152628e30874SSatish Balay const PetscScalar *b; 152728e30874SSatish Balay 152828e30874SSatish Balay PetscFunctionBegin; 152928e30874SSatish Balay if (!n) PetscFunctionReturn(0); 153028e30874SSatish Balay 15319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15329566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 153328e30874SSatish Balay t = a->solve_work; 153428e30874SSatish Balay 15359371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 15369371c9d4SSatish Balay r = rout; 15379371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 15389371c9d4SSatish Balay c = cout + (n - 1); 153928e30874SSatish Balay 154028e30874SSatish Balay /* forward solve the lower triangular */ 154128e30874SSatish Balay t[0] = b[*r++]; 154228e30874SSatish Balay for (i = 1; i < n; i++) { 154328e30874SSatish Balay v = aa + ai[i]; 154428e30874SSatish Balay vi = aj + ai[i]; 154528e30874SSatish Balay nz = diag[i] - ai[i]; 154628e30874SSatish Balay s1 = b[*r++]; 1547*ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 154828e30874SSatish Balay t[i] = s1; 154928e30874SSatish Balay } 155028e30874SSatish Balay /* backward solve the upper triangular */ 155128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 155228e30874SSatish Balay v = aa + diag[i] + 1; 155328e30874SSatish Balay vi = aj + diag[i] + 1; 155428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 155528e30874SSatish Balay s1 = t[i]; 1556*ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 155728e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]] * s1; 155828e30874SSatish Balay } 155928e30874SSatish Balay 15609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15619566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15629566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15639566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 156528e30874SSatish Balay PetscFunctionReturn(0); 156628e30874SSatish Balay } 156728e30874SSatish Balay 15689371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) { 156928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 157028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 157128e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 157228e30874SSatish Balay const PetscInt *rout, *cout, *r, *c; 157328e30874SSatish Balay PetscScalar *x, *tmp, sum; 157428e30874SSatish Balay const PetscScalar *b; 157528e30874SSatish Balay const MatScalar *aa = a->a, *v; 157628e30874SSatish Balay 157728e30874SSatish Balay PetscFunctionBegin; 157828e30874SSatish Balay if (!n) PetscFunctionReturn(0); 157928e30874SSatish Balay 15809566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15819566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 158228e30874SSatish Balay tmp = a->solve_work; 158328e30874SSatish Balay 15849371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 15859371c9d4SSatish Balay r = rout; 15869371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 15879371c9d4SSatish Balay c = cout; 158828e30874SSatish Balay 158928e30874SSatish Balay /* forward solve the lower triangular */ 159028e30874SSatish Balay tmp[0] = b[r[0]]; 159128e30874SSatish Balay v = aa; 159228e30874SSatish Balay vi = aj; 159328e30874SSatish Balay for (i = 1; i < n; i++) { 159428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 159528e30874SSatish Balay sum = b[r[i]]; 159628e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 159728e30874SSatish Balay tmp[i] = sum; 15989371c9d4SSatish Balay v += nz; 15999371c9d4SSatish Balay vi += nz; 160028e30874SSatish Balay } 160128e30874SSatish Balay 160228e30874SSatish Balay /* backward solve the upper triangular */ 160328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 160428e30874SSatish Balay v = aa + adiag[i + 1] + 1; 160528e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 160628e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 160728e30874SSatish Balay sum = tmp[i]; 160828e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 160928e30874SSatish Balay x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 161028e30874SSatish Balay } 161128e30874SSatish Balay 16129566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 16139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 16149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 16159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 16169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 161728e30874SSatish Balay PetscFunctionReturn(0); 161828e30874SSatish Balay } 1619