128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 328e30874SSatish Balay 4d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 5d71ae5a4SJacob Faibussowitsch { 628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 928e30874SSatish Balay const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 1028e30874SSatish Balay PetscInt i, nz; 1128e30874SSatish Balay const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 1228e30874SSatish Balay const MatScalar *aa = a->a, *v; 1328e30874SSatish Balay PetscScalar *x, *s, *t, *ls; 1428e30874SSatish Balay const PetscScalar *b; 1528e30874SSatish Balay 1628e30874SSatish Balay PetscFunctionBegin; 179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 189566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 1928e30874SSatish Balay t = a->solve_work; 2028e30874SSatish Balay 219371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 229371c9d4SSatish Balay r = rout; 239371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 249371c9d4SSatish Balay c = cout + (n - 1); 2528e30874SSatish Balay 2628e30874SSatish Balay /* forward solve the lower triangular */ 279566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 2828e30874SSatish Balay for (i = 1; i < n; i++) { 2928e30874SSatish Balay v = aa + bs2 * ai[i]; 3028e30874SSatish Balay vi = aj + ai[i]; 3128e30874SSatish Balay nz = a->diag[i] - ai[i]; 3228e30874SSatish Balay s = t + bs * i; 339566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 3428e30874SSatish Balay while (nz--) { 3528e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 3628e30874SSatish Balay v += bs2; 3728e30874SSatish Balay } 3828e30874SSatish Balay } 3928e30874SSatish Balay /* backward solve the upper triangular */ 4028e30874SSatish Balay ls = a->solve_work + A->cmap->n; 4128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 4228e30874SSatish Balay v = aa + bs2 * (a->diag[i] + 1); 4328e30874SSatish Balay vi = aj + a->diag[i] + 1; 4428e30874SSatish Balay nz = ai[i + 1] - a->diag[i] - 1; 459566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 4628e30874SSatish Balay while (nz--) { 4728e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 4828e30874SSatish Balay v += bs2; 4928e30874SSatish Balay } 5028e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 519566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 5228e30874SSatish Balay } 5328e30874SSatish Balay 549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 559566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 59*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6028e30874SSatish Balay } 6128e30874SSatish Balay 62d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 63d71ae5a4SJacob Faibussowitsch { 6428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 6628e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j; 6728e30874SSatish Balay const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 6828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 6928e30874SSatish Balay const MatScalar *aa = a->a, *v; 7028e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 7128e30874SSatish Balay const PetscScalar *b; 7228e30874SSatish Balay 7328e30874SSatish Balay PetscFunctionBegin; 749566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 759566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 7628e30874SSatish Balay t = a->solve_work; 7728e30874SSatish Balay 789371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 799371c9d4SSatish Balay r = rout; 809371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 819371c9d4SSatish Balay c = cout + (n - 1); 8228e30874SSatish Balay 8328e30874SSatish Balay /* forward solve the lower triangular */ 8428e30874SSatish Balay idx = 7 * (*r++); 859371c9d4SSatish Balay t[0] = b[idx]; 869371c9d4SSatish Balay t[1] = b[1 + idx]; 879371c9d4SSatish Balay t[2] = b[2 + idx]; 889371c9d4SSatish Balay t[3] = b[3 + idx]; 899371c9d4SSatish Balay t[4] = b[4 + idx]; 909371c9d4SSatish Balay t[5] = b[5 + idx]; 919371c9d4SSatish Balay t[6] = b[6 + idx]; 9228e30874SSatish Balay 9328e30874SSatish Balay for (i = 1; i < n; i++) { 9428e30874SSatish Balay v = aa + 49 * ai[i]; 9528e30874SSatish Balay vi = aj + ai[i]; 9628e30874SSatish Balay nz = diag[i] - ai[i]; 9728e30874SSatish Balay idx = 7 * (*r++); 989371c9d4SSatish Balay s1 = b[idx]; 999371c9d4SSatish Balay s2 = b[1 + idx]; 1009371c9d4SSatish Balay s3 = b[2 + idx]; 1019371c9d4SSatish Balay s4 = b[3 + idx]; 1029371c9d4SSatish Balay s5 = b[4 + idx]; 1039371c9d4SSatish Balay s6 = b[5 + idx]; 1049371c9d4SSatish Balay s7 = b[6 + idx]; 10528e30874SSatish Balay while (nz--) { 10628e30874SSatish Balay idx = 7 * (*vi++); 1079371c9d4SSatish Balay x1 = t[idx]; 1089371c9d4SSatish Balay x2 = t[1 + idx]; 1099371c9d4SSatish Balay x3 = t[2 + idx]; 1109371c9d4SSatish Balay x4 = t[3 + idx]; 1119371c9d4SSatish Balay x5 = t[4 + idx]; 1129371c9d4SSatish Balay x6 = t[5 + idx]; 1139371c9d4SSatish Balay x7 = t[6 + idx]; 11428e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 11528e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 11628e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 11728e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 11828e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 11928e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 12028e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 12128e30874SSatish Balay v += 49; 12228e30874SSatish Balay } 12328e30874SSatish Balay idx = 7 * i; 1249371c9d4SSatish Balay t[idx] = s1; 1259371c9d4SSatish Balay t[1 + idx] = s2; 1269371c9d4SSatish Balay t[2 + idx] = s3; 1279371c9d4SSatish Balay t[3 + idx] = s4; 1289371c9d4SSatish Balay t[4 + idx] = s5; 1299371c9d4SSatish Balay t[5 + idx] = s6; 1309371c9d4SSatish Balay t[6 + idx] = s7; 13128e30874SSatish Balay } 13228e30874SSatish Balay /* backward solve the upper triangular */ 13328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 13428e30874SSatish Balay v = aa + 49 * diag[i] + 49; 13528e30874SSatish Balay vi = aj + diag[i] + 1; 13628e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 13728e30874SSatish Balay idt = 7 * i; 1389371c9d4SSatish Balay s1 = t[idt]; 1399371c9d4SSatish Balay s2 = t[1 + idt]; 1409371c9d4SSatish Balay s3 = t[2 + idt]; 1419371c9d4SSatish Balay s4 = t[3 + idt]; 1429371c9d4SSatish Balay s5 = t[4 + idt]; 1439371c9d4SSatish Balay s6 = t[5 + idt]; 1449371c9d4SSatish Balay s7 = t[6 + idt]; 14528e30874SSatish Balay while (nz--) { 14628e30874SSatish Balay idx = 7 * (*vi++); 1479371c9d4SSatish Balay x1 = t[idx]; 1489371c9d4SSatish Balay x2 = t[1 + idx]; 1499371c9d4SSatish Balay x3 = t[2 + idx]; 1509371c9d4SSatish Balay x4 = t[3 + idx]; 1519371c9d4SSatish Balay x5 = t[4 + idx]; 1529371c9d4SSatish Balay x6 = t[5 + idx]; 1539371c9d4SSatish Balay x7 = t[6 + idx]; 15428e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 15528e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 15628e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 15728e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 15828e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 15928e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 16028e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 16128e30874SSatish Balay v += 49; 16228e30874SSatish Balay } 16328e30874SSatish Balay idc = 7 * (*c--); 16428e30874SSatish Balay v = aa + 49 * diag[i]; 1659371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 1669371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 1679371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 1689371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 1699371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 1709371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 1719371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 17228e30874SSatish Balay } 17328e30874SSatish Balay 1749566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 1759566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 1769566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 1779566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 1789566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 179*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18028e30874SSatish Balay } 18128e30874SSatish Balay 182d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) 183d71ae5a4SJacob Faibussowitsch { 18428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 18628e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 18728e30874SSatish Balay const PetscInt n = a->mbs, *rout, *cout, *vi; 18828e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 18928e30874SSatish Balay const MatScalar *aa = a->a, *v; 19028e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 19128e30874SSatish Balay const PetscScalar *b; 19228e30874SSatish Balay 19328e30874SSatish Balay PetscFunctionBegin; 1949566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 1959566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 19628e30874SSatish Balay t = a->solve_work; 19728e30874SSatish Balay 1989371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1999371c9d4SSatish Balay r = rout; 2009371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 2019371c9d4SSatish Balay c = cout; 20228e30874SSatish Balay 20328e30874SSatish Balay /* forward solve the lower triangular */ 20428e30874SSatish Balay idx = 7 * r[0]; 2059371c9d4SSatish Balay t[0] = b[idx]; 2069371c9d4SSatish Balay t[1] = b[1 + idx]; 2079371c9d4SSatish Balay t[2] = b[2 + idx]; 2089371c9d4SSatish Balay t[3] = b[3 + idx]; 2099371c9d4SSatish Balay t[4] = b[4 + idx]; 2109371c9d4SSatish Balay t[5] = b[5 + idx]; 2119371c9d4SSatish Balay t[6] = b[6 + idx]; 21228e30874SSatish Balay 21328e30874SSatish Balay for (i = 1; i < n; i++) { 21428e30874SSatish Balay v = aa + 49 * ai[i]; 21528e30874SSatish Balay vi = aj + ai[i]; 21628e30874SSatish Balay nz = ai[i + 1] - ai[i]; 21728e30874SSatish Balay idx = 7 * r[i]; 2189371c9d4SSatish Balay s1 = b[idx]; 2199371c9d4SSatish Balay s2 = b[1 + idx]; 2209371c9d4SSatish Balay s3 = b[2 + idx]; 2219371c9d4SSatish Balay s4 = b[3 + idx]; 2229371c9d4SSatish Balay s5 = b[4 + idx]; 2239371c9d4SSatish Balay s6 = b[5 + idx]; 2249371c9d4SSatish Balay s7 = b[6 + idx]; 22528e30874SSatish Balay for (m = 0; m < nz; m++) { 22628e30874SSatish Balay idx = 7 * vi[m]; 2279371c9d4SSatish Balay x1 = t[idx]; 2289371c9d4SSatish Balay x2 = t[1 + idx]; 2299371c9d4SSatish Balay x3 = t[2 + idx]; 2309371c9d4SSatish Balay x4 = t[3 + idx]; 2319371c9d4SSatish Balay x5 = t[4 + idx]; 2329371c9d4SSatish Balay x6 = t[5 + idx]; 2339371c9d4SSatish Balay x7 = t[6 + idx]; 23428e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 23528e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 23628e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 23728e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 23828e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 23928e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24028e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24128e30874SSatish Balay v += 49; 24228e30874SSatish Balay } 24328e30874SSatish Balay idx = 7 * i; 2449371c9d4SSatish Balay t[idx] = s1; 2459371c9d4SSatish Balay t[1 + idx] = s2; 2469371c9d4SSatish Balay t[2 + idx] = s3; 2479371c9d4SSatish Balay t[3 + idx] = s4; 2489371c9d4SSatish Balay t[4 + idx] = s5; 2499371c9d4SSatish Balay t[5 + idx] = s6; 2509371c9d4SSatish Balay t[6 + idx] = s7; 25128e30874SSatish Balay } 25228e30874SSatish Balay /* backward solve the upper triangular */ 25328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 25428e30874SSatish Balay v = aa + 49 * (adiag[i + 1] + 1); 25528e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 25628e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 25728e30874SSatish Balay idt = 7 * i; 2589371c9d4SSatish Balay s1 = t[idt]; 2599371c9d4SSatish Balay s2 = t[1 + idt]; 2609371c9d4SSatish Balay s3 = t[2 + idt]; 2619371c9d4SSatish Balay s4 = t[3 + idt]; 2629371c9d4SSatish Balay s5 = t[4 + idt]; 2639371c9d4SSatish Balay s6 = t[5 + idt]; 2649371c9d4SSatish Balay s7 = t[6 + idt]; 26528e30874SSatish Balay for (m = 0; m < nz; m++) { 26628e30874SSatish Balay idx = 7 * vi[m]; 2679371c9d4SSatish Balay x1 = t[idx]; 2689371c9d4SSatish Balay x2 = t[1 + idx]; 2699371c9d4SSatish Balay x3 = t[2 + idx]; 2709371c9d4SSatish Balay x4 = t[3 + idx]; 2719371c9d4SSatish Balay x5 = t[4 + idx]; 2729371c9d4SSatish Balay x6 = t[5 + idx]; 2739371c9d4SSatish Balay x7 = t[6 + idx]; 27428e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 27528e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 27628e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 27728e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 27828e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 27928e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 28028e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 28128e30874SSatish Balay v += 49; 28228e30874SSatish Balay } 28328e30874SSatish Balay idc = 7 * c[i]; 2849371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 2859371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 2869371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 2879371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 2889371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 2899371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 2909371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 29128e30874SSatish Balay } 29228e30874SSatish Balay 2939566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 2949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 2959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 2969566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 2979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 298*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 29928e30874SSatish Balay } 30028e30874SSatish Balay 301d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 302d71ae5a4SJacob Faibussowitsch { 30328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 30428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 30528e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 30628e30874SSatish Balay const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 30728e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 30828e30874SSatish Balay const MatScalar *aa = a->a, *v; 30928e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 31028e30874SSatish Balay const PetscScalar *b; 31128e30874SSatish Balay 31228e30874SSatish Balay PetscFunctionBegin; 3139566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 3149566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 31528e30874SSatish Balay t = a->solve_work; 31628e30874SSatish Balay 3179371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 3189371c9d4SSatish Balay r = rout; 3199371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 3209371c9d4SSatish Balay c = cout + (n - 1); 32128e30874SSatish Balay 32228e30874SSatish Balay /* forward solve the lower triangular */ 32328e30874SSatish Balay idx = 6 * (*r++); 3249371c9d4SSatish Balay t[0] = b[idx]; 3259371c9d4SSatish Balay t[1] = b[1 + idx]; 3269371c9d4SSatish Balay t[2] = b[2 + idx]; 3279371c9d4SSatish Balay t[3] = b[3 + idx]; 3289371c9d4SSatish Balay t[4] = b[4 + idx]; 3299371c9d4SSatish Balay t[5] = b[5 + idx]; 33028e30874SSatish Balay for (i = 1; i < n; i++) { 33128e30874SSatish Balay v = aa + 36 * ai[i]; 33228e30874SSatish Balay vi = aj + ai[i]; 33328e30874SSatish Balay nz = diag[i] - ai[i]; 33428e30874SSatish Balay idx = 6 * (*r++); 3359371c9d4SSatish Balay s1 = b[idx]; 3369371c9d4SSatish Balay s2 = b[1 + idx]; 3379371c9d4SSatish Balay s3 = b[2 + idx]; 3389371c9d4SSatish Balay s4 = b[3 + idx]; 3399371c9d4SSatish Balay s5 = b[4 + idx]; 3409371c9d4SSatish Balay s6 = b[5 + idx]; 34128e30874SSatish Balay while (nz--) { 34228e30874SSatish Balay idx = 6 * (*vi++); 3439371c9d4SSatish Balay x1 = t[idx]; 3449371c9d4SSatish Balay x2 = t[1 + idx]; 3459371c9d4SSatish Balay x3 = t[2 + idx]; 3469371c9d4SSatish Balay x4 = t[3 + idx]; 3479371c9d4SSatish Balay x5 = t[4 + idx]; 3489371c9d4SSatish Balay x6 = t[5 + idx]; 34928e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 35028e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 35128e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 35228e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 35328e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 35428e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 35528e30874SSatish Balay v += 36; 35628e30874SSatish Balay } 35728e30874SSatish Balay idx = 6 * i; 3589371c9d4SSatish Balay t[idx] = s1; 3599371c9d4SSatish Balay t[1 + idx] = s2; 3609371c9d4SSatish Balay t[2 + idx] = s3; 3619371c9d4SSatish Balay t[3 + idx] = s4; 3629371c9d4SSatish Balay t[4 + idx] = s5; 3639371c9d4SSatish Balay t[5 + idx] = s6; 36428e30874SSatish Balay } 36528e30874SSatish Balay /* backward solve the upper triangular */ 36628e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 36728e30874SSatish Balay v = aa + 36 * diag[i] + 36; 36828e30874SSatish Balay vi = aj + diag[i] + 1; 36928e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 37028e30874SSatish Balay idt = 6 * i; 3719371c9d4SSatish Balay s1 = t[idt]; 3729371c9d4SSatish Balay s2 = t[1 + idt]; 3739371c9d4SSatish Balay s3 = t[2 + idt]; 3749371c9d4SSatish Balay s4 = t[3 + idt]; 3759371c9d4SSatish Balay s5 = t[4 + idt]; 3769371c9d4SSatish Balay s6 = t[5 + idt]; 37728e30874SSatish Balay while (nz--) { 37828e30874SSatish Balay idx = 6 * (*vi++); 3799371c9d4SSatish Balay x1 = t[idx]; 3809371c9d4SSatish Balay x2 = t[1 + idx]; 3819371c9d4SSatish Balay x3 = t[2 + idx]; 3829371c9d4SSatish Balay x4 = t[3 + idx]; 3839371c9d4SSatish Balay x5 = t[4 + idx]; 3849371c9d4SSatish Balay x6 = t[5 + idx]; 38528e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 38628e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 38728e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 38828e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 38928e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 39028e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 39128e30874SSatish Balay v += 36; 39228e30874SSatish Balay } 39328e30874SSatish Balay idc = 6 * (*c--); 39428e30874SSatish Balay v = aa + 36 * diag[i]; 3959371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 3969371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 3979371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 3989371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 3999371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 4009371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 40128e30874SSatish Balay } 40228e30874SSatish Balay 4039566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 4049566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 4059566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 4069566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 4079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 408*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 40928e30874SSatish Balay } 41028e30874SSatish Balay 411d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) 412d71ae5a4SJacob Faibussowitsch { 41328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 41428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 41528e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 41628e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 41728e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 41828e30874SSatish Balay const MatScalar *aa = a->a, *v; 41928e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 42028e30874SSatish Balay const PetscScalar *b; 42128e30874SSatish Balay 42228e30874SSatish Balay PetscFunctionBegin; 4239566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 4249566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42528e30874SSatish Balay t = a->solve_work; 42628e30874SSatish Balay 4279371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 4289371c9d4SSatish Balay r = rout; 4299371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 4309371c9d4SSatish Balay c = cout; 43128e30874SSatish Balay 43228e30874SSatish Balay /* forward solve the lower triangular */ 43328e30874SSatish Balay idx = 6 * r[0]; 4349371c9d4SSatish Balay t[0] = b[idx]; 4359371c9d4SSatish Balay t[1] = b[1 + idx]; 4369371c9d4SSatish Balay t[2] = b[2 + idx]; 4379371c9d4SSatish Balay t[3] = b[3 + idx]; 4389371c9d4SSatish Balay t[4] = b[4 + idx]; 4399371c9d4SSatish Balay t[5] = b[5 + idx]; 44028e30874SSatish Balay for (i = 1; i < n; i++) { 44128e30874SSatish Balay v = aa + 36 * ai[i]; 44228e30874SSatish Balay vi = aj + ai[i]; 44328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 44428e30874SSatish Balay idx = 6 * r[i]; 4459371c9d4SSatish Balay s1 = b[idx]; 4469371c9d4SSatish Balay s2 = b[1 + idx]; 4479371c9d4SSatish Balay s3 = b[2 + idx]; 4489371c9d4SSatish Balay s4 = b[3 + idx]; 4499371c9d4SSatish Balay s5 = b[4 + idx]; 4509371c9d4SSatish Balay s6 = b[5 + idx]; 45128e30874SSatish Balay for (m = 0; m < nz; m++) { 45228e30874SSatish Balay idx = 6 * vi[m]; 4539371c9d4SSatish Balay x1 = t[idx]; 4549371c9d4SSatish Balay x2 = t[1 + idx]; 4559371c9d4SSatish Balay x3 = t[2 + idx]; 4569371c9d4SSatish Balay x4 = t[3 + idx]; 4579371c9d4SSatish Balay x5 = t[4 + idx]; 4589371c9d4SSatish Balay x6 = t[5 + idx]; 45928e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 46028e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 46128e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 46228e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 46328e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 46428e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 46528e30874SSatish Balay v += 36; 46628e30874SSatish Balay } 46728e30874SSatish Balay idx = 6 * i; 4689371c9d4SSatish Balay t[idx] = s1; 4699371c9d4SSatish Balay t[1 + idx] = s2; 4709371c9d4SSatish Balay t[2 + idx] = s3; 4719371c9d4SSatish Balay t[3 + idx] = s4; 4729371c9d4SSatish Balay t[4 + idx] = s5; 4739371c9d4SSatish Balay t[5 + idx] = s6; 47428e30874SSatish Balay } 47528e30874SSatish Balay /* backward solve the upper triangular */ 47628e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 47728e30874SSatish Balay v = aa + 36 * (adiag[i + 1] + 1); 47828e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 47928e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 48028e30874SSatish Balay idt = 6 * i; 4819371c9d4SSatish Balay s1 = t[idt]; 4829371c9d4SSatish Balay s2 = t[1 + idt]; 4839371c9d4SSatish Balay s3 = t[2 + idt]; 4849371c9d4SSatish Balay s4 = t[3 + idt]; 4859371c9d4SSatish Balay s5 = t[4 + idt]; 4869371c9d4SSatish Balay s6 = t[5 + idt]; 48728e30874SSatish Balay for (m = 0; m < nz; m++) { 48828e30874SSatish Balay idx = 6 * vi[m]; 4899371c9d4SSatish Balay x1 = t[idx]; 4909371c9d4SSatish Balay x2 = t[1 + idx]; 4919371c9d4SSatish Balay x3 = t[2 + idx]; 4929371c9d4SSatish Balay x4 = t[3 + idx]; 4939371c9d4SSatish Balay x5 = t[4 + idx]; 4949371c9d4SSatish Balay x6 = t[5 + idx]; 49528e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 49628e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 49728e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 49828e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 49928e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 50028e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 50128e30874SSatish Balay v += 36; 50228e30874SSatish Balay } 50328e30874SSatish Balay idc = 6 * c[i]; 5049371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 5059371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 5069371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 5079371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 5089371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 5099371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 51028e30874SSatish Balay } 51128e30874SSatish Balay 5129566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 5139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 5149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 5159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 5169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 517*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 51828e30874SSatish Balay } 51928e30874SSatish Balay 520d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 521d71ae5a4SJacob Faibussowitsch { 52228e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 52328e30874SSatish Balay IS iscol = a->col, isrow = a->row; 52428e30874SSatish Balay const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 52528e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 52628e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 52728e30874SSatish Balay const MatScalar *aa = a->a, *v; 52828e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 52928e30874SSatish Balay const PetscScalar *b; 53028e30874SSatish Balay 53128e30874SSatish Balay PetscFunctionBegin; 5329566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 5339566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 53428e30874SSatish Balay t = a->solve_work; 53528e30874SSatish Balay 5369371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 5379371c9d4SSatish Balay r = rout; 5389371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 5399371c9d4SSatish Balay c = cout + (n - 1); 54028e30874SSatish Balay 54128e30874SSatish Balay /* forward solve the lower triangular */ 54228e30874SSatish Balay idx = 5 * (*r++); 5439371c9d4SSatish Balay t[0] = b[idx]; 5449371c9d4SSatish Balay t[1] = b[1 + idx]; 5459371c9d4SSatish Balay t[2] = b[2 + idx]; 5469371c9d4SSatish Balay t[3] = b[3 + idx]; 5479371c9d4SSatish Balay t[4] = b[4 + idx]; 54828e30874SSatish Balay for (i = 1; i < n; i++) { 54928e30874SSatish Balay v = aa + 25 * ai[i]; 55028e30874SSatish Balay vi = aj + ai[i]; 55128e30874SSatish Balay nz = diag[i] - ai[i]; 55228e30874SSatish Balay idx = 5 * (*r++); 5539371c9d4SSatish Balay s1 = b[idx]; 5549371c9d4SSatish Balay s2 = b[1 + idx]; 5559371c9d4SSatish Balay s3 = b[2 + idx]; 5569371c9d4SSatish Balay s4 = b[3 + idx]; 55728e30874SSatish Balay s5 = b[4 + idx]; 55828e30874SSatish Balay while (nz--) { 55928e30874SSatish Balay idx = 5 * (*vi++); 5609371c9d4SSatish Balay x1 = t[idx]; 5619371c9d4SSatish Balay x2 = t[1 + idx]; 5629371c9d4SSatish Balay x3 = t[2 + idx]; 5639371c9d4SSatish Balay x4 = t[3 + idx]; 5649371c9d4SSatish Balay x5 = t[4 + idx]; 56528e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 56628e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 56728e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 56828e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 56928e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 57028e30874SSatish Balay v += 25; 57128e30874SSatish Balay } 57228e30874SSatish Balay idx = 5 * i; 5739371c9d4SSatish Balay t[idx] = s1; 5749371c9d4SSatish Balay t[1 + idx] = s2; 5759371c9d4SSatish Balay t[2 + idx] = s3; 5769371c9d4SSatish Balay t[3 + idx] = s4; 5779371c9d4SSatish Balay t[4 + idx] = s5; 57828e30874SSatish Balay } 57928e30874SSatish Balay /* backward solve the upper triangular */ 58028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 58128e30874SSatish Balay v = aa + 25 * diag[i] + 25; 58228e30874SSatish Balay vi = aj + diag[i] + 1; 58328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 58428e30874SSatish Balay idt = 5 * i; 5859371c9d4SSatish Balay s1 = t[idt]; 5869371c9d4SSatish Balay s2 = t[1 + idt]; 5879371c9d4SSatish Balay s3 = t[2 + idt]; 5889371c9d4SSatish Balay s4 = t[3 + idt]; 5899371c9d4SSatish Balay s5 = t[4 + idt]; 59028e30874SSatish Balay while (nz--) { 59128e30874SSatish Balay idx = 5 * (*vi++); 5929371c9d4SSatish Balay x1 = t[idx]; 5939371c9d4SSatish Balay x2 = t[1 + idx]; 5949371c9d4SSatish Balay x3 = t[2 + idx]; 5959371c9d4SSatish Balay x4 = t[3 + idx]; 5969371c9d4SSatish Balay x5 = t[4 + idx]; 59728e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 59828e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 59928e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 60028e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 60128e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 60228e30874SSatish Balay v += 25; 60328e30874SSatish Balay } 60428e30874SSatish Balay idc = 5 * (*c--); 60528e30874SSatish Balay v = aa + 25 * diag[i]; 6069371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 6079371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 6089371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 6099371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 6109371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 61128e30874SSatish Balay } 61228e30874SSatish Balay 6139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 6149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 6159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 6169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 6179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 618*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 61928e30874SSatish Balay } 62028e30874SSatish Balay 621d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) 622d71ae5a4SJacob Faibussowitsch { 62328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 62428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 62528e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 62628e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 62728e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 62828e30874SSatish Balay const MatScalar *aa = a->a, *v; 62928e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 63028e30874SSatish Balay const PetscScalar *b; 63128e30874SSatish Balay 63228e30874SSatish Balay PetscFunctionBegin; 6339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 6349566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 63528e30874SSatish Balay t = a->solve_work; 63628e30874SSatish Balay 6379371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 6389371c9d4SSatish Balay r = rout; 6399371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 6409371c9d4SSatish Balay c = cout; 64128e30874SSatish Balay 64228e30874SSatish Balay /* forward solve the lower triangular */ 64328e30874SSatish Balay idx = 5 * r[0]; 6449371c9d4SSatish Balay t[0] = b[idx]; 6459371c9d4SSatish Balay t[1] = b[1 + idx]; 6469371c9d4SSatish Balay t[2] = b[2 + idx]; 6479371c9d4SSatish Balay t[3] = b[3 + idx]; 6489371c9d4SSatish Balay t[4] = b[4 + idx]; 64928e30874SSatish Balay for (i = 1; i < n; i++) { 65028e30874SSatish Balay v = aa + 25 * ai[i]; 65128e30874SSatish Balay vi = aj + ai[i]; 65228e30874SSatish Balay nz = ai[i + 1] - ai[i]; 65328e30874SSatish Balay idx = 5 * r[i]; 6549371c9d4SSatish Balay s1 = b[idx]; 6559371c9d4SSatish Balay s2 = b[1 + idx]; 6569371c9d4SSatish Balay s3 = b[2 + idx]; 6579371c9d4SSatish Balay s4 = b[3 + idx]; 65828e30874SSatish Balay s5 = b[4 + idx]; 65928e30874SSatish Balay for (m = 0; m < nz; m++) { 66028e30874SSatish Balay idx = 5 * vi[m]; 6619371c9d4SSatish Balay x1 = t[idx]; 6629371c9d4SSatish Balay x2 = t[1 + idx]; 6639371c9d4SSatish Balay x3 = t[2 + idx]; 6649371c9d4SSatish Balay x4 = t[3 + idx]; 6659371c9d4SSatish Balay x5 = t[4 + idx]; 66628e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 66728e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 66828e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 66928e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 67028e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 67128e30874SSatish Balay v += 25; 67228e30874SSatish Balay } 67328e30874SSatish Balay idx = 5 * i; 6749371c9d4SSatish Balay t[idx] = s1; 6759371c9d4SSatish Balay t[1 + idx] = s2; 6769371c9d4SSatish Balay t[2 + idx] = s3; 6779371c9d4SSatish Balay t[3 + idx] = s4; 6789371c9d4SSatish Balay t[4 + idx] = s5; 67928e30874SSatish Balay } 68028e30874SSatish Balay /* backward solve the upper triangular */ 68128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 68228e30874SSatish Balay v = aa + 25 * (adiag[i + 1] + 1); 68328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 68428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 68528e30874SSatish Balay idt = 5 * i; 6869371c9d4SSatish Balay s1 = t[idt]; 6879371c9d4SSatish Balay s2 = t[1 + idt]; 6889371c9d4SSatish Balay s3 = t[2 + idt]; 6899371c9d4SSatish Balay s4 = t[3 + idt]; 6909371c9d4SSatish Balay s5 = t[4 + idt]; 69128e30874SSatish Balay for (m = 0; m < nz; m++) { 69228e30874SSatish Balay idx = 5 * vi[m]; 6939371c9d4SSatish Balay x1 = t[idx]; 6949371c9d4SSatish Balay x2 = t[1 + idx]; 6959371c9d4SSatish Balay x3 = t[2 + idx]; 6969371c9d4SSatish Balay x4 = t[3 + idx]; 6979371c9d4SSatish Balay x5 = t[4 + idx]; 69828e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 69928e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 70028e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 70128e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 70228e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 70328e30874SSatish Balay v += 25; 70428e30874SSatish Balay } 70528e30874SSatish Balay idc = 5 * c[i]; 7069371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 7079371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 7089371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 7099371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 7109371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 71128e30874SSatish Balay } 71228e30874SSatish Balay 7139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 7169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 7179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 718*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 71928e30874SSatish Balay } 72028e30874SSatish Balay 721d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 722d71ae5a4SJacob Faibussowitsch { 72328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 72428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 72528e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 72628e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 72728e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 72828e30874SSatish Balay const MatScalar *aa = a->a, *v; 72928e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 73028e30874SSatish Balay const PetscScalar *b; 73128e30874SSatish Balay 73228e30874SSatish Balay PetscFunctionBegin; 7339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7349566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 73528e30874SSatish Balay t = a->solve_work; 73628e30874SSatish Balay 7379371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7389371c9d4SSatish Balay r = rout; 7399371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7409371c9d4SSatish Balay c = cout + (n - 1); 74128e30874SSatish Balay 74228e30874SSatish Balay /* forward solve the lower triangular */ 74328e30874SSatish Balay idx = 4 * (*r++); 7449371c9d4SSatish Balay t[0] = b[idx]; 7459371c9d4SSatish Balay t[1] = b[1 + idx]; 7469371c9d4SSatish Balay t[2] = b[2 + idx]; 7479371c9d4SSatish Balay t[3] = b[3 + idx]; 74828e30874SSatish Balay for (i = 1; i < n; i++) { 74928e30874SSatish Balay v = aa + 16 * ai[i]; 75028e30874SSatish Balay vi = aj + ai[i]; 75128e30874SSatish Balay nz = diag[i] - ai[i]; 75228e30874SSatish Balay idx = 4 * (*r++); 7539371c9d4SSatish Balay s1 = b[idx]; 7549371c9d4SSatish Balay s2 = b[1 + idx]; 7559371c9d4SSatish Balay s3 = b[2 + idx]; 7569371c9d4SSatish Balay s4 = b[3 + idx]; 75728e30874SSatish Balay while (nz--) { 75828e30874SSatish Balay idx = 4 * (*vi++); 7599371c9d4SSatish Balay x1 = t[idx]; 7609371c9d4SSatish Balay x2 = t[1 + idx]; 7619371c9d4SSatish Balay x3 = t[2 + idx]; 7629371c9d4SSatish Balay x4 = t[3 + idx]; 76328e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 76428e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 76528e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 76628e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 76728e30874SSatish Balay v += 16; 76828e30874SSatish Balay } 76928e30874SSatish Balay idx = 4 * i; 7709371c9d4SSatish Balay t[idx] = s1; 7719371c9d4SSatish Balay t[1 + idx] = s2; 7729371c9d4SSatish Balay t[2 + idx] = s3; 7739371c9d4SSatish Balay t[3 + idx] = s4; 77428e30874SSatish Balay } 77528e30874SSatish Balay /* backward solve the upper triangular */ 77628e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 77728e30874SSatish Balay v = aa + 16 * diag[i] + 16; 77828e30874SSatish Balay vi = aj + diag[i] + 1; 77928e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 78028e30874SSatish Balay idt = 4 * i; 7819371c9d4SSatish Balay s1 = t[idt]; 7829371c9d4SSatish Balay s2 = t[1 + idt]; 7839371c9d4SSatish Balay s3 = t[2 + idt]; 7849371c9d4SSatish Balay s4 = t[3 + idt]; 78528e30874SSatish Balay while (nz--) { 78628e30874SSatish Balay idx = 4 * (*vi++); 7879371c9d4SSatish Balay x1 = t[idx]; 7889371c9d4SSatish Balay x2 = t[1 + idx]; 7899371c9d4SSatish Balay x3 = t[2 + idx]; 7909371c9d4SSatish Balay x4 = t[3 + idx]; 79128e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 79228e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 79328e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 79428e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 79528e30874SSatish Balay v += 16; 79628e30874SSatish Balay } 79728e30874SSatish Balay idc = 4 * (*c--); 79828e30874SSatish Balay v = aa + 16 * diag[i]; 79928e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 80028e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 80128e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 80228e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 80328e30874SSatish Balay } 80428e30874SSatish Balay 8059566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8069566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8079566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8089566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8099566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 810*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 81128e30874SSatish Balay } 81228e30874SSatish Balay 813d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) 814d71ae5a4SJacob Faibussowitsch { 81528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 81628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 81728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 81828e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 81928e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 82028e30874SSatish Balay const MatScalar *aa = a->a, *v; 82128e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 82228e30874SSatish Balay const PetscScalar *b; 82328e30874SSatish Balay 82428e30874SSatish Balay PetscFunctionBegin; 8259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 8269566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 82728e30874SSatish Balay t = a->solve_work; 82828e30874SSatish Balay 8299371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 8309371c9d4SSatish Balay r = rout; 8319371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 8329371c9d4SSatish Balay c = cout; 83328e30874SSatish Balay 83428e30874SSatish Balay /* forward solve the lower triangular */ 83528e30874SSatish Balay idx = 4 * r[0]; 8369371c9d4SSatish Balay t[0] = b[idx]; 8379371c9d4SSatish Balay t[1] = b[1 + idx]; 8389371c9d4SSatish Balay t[2] = b[2 + idx]; 8399371c9d4SSatish Balay t[3] = b[3 + idx]; 84028e30874SSatish Balay for (i = 1; i < n; i++) { 84128e30874SSatish Balay v = aa + 16 * ai[i]; 84228e30874SSatish Balay vi = aj + ai[i]; 84328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 84428e30874SSatish Balay idx = 4 * r[i]; 8459371c9d4SSatish Balay s1 = b[idx]; 8469371c9d4SSatish Balay s2 = b[1 + idx]; 8479371c9d4SSatish Balay s3 = b[2 + idx]; 8489371c9d4SSatish Balay s4 = b[3 + idx]; 84928e30874SSatish Balay for (m = 0; m < nz; m++) { 85028e30874SSatish Balay idx = 4 * vi[m]; 8519371c9d4SSatish Balay x1 = t[idx]; 8529371c9d4SSatish Balay x2 = t[1 + idx]; 8539371c9d4SSatish Balay x3 = t[2 + idx]; 8549371c9d4SSatish Balay x4 = t[3 + idx]; 85528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 85628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 85728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 85828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 85928e30874SSatish Balay v += 16; 86028e30874SSatish Balay } 86128e30874SSatish Balay idx = 4 * i; 8629371c9d4SSatish Balay t[idx] = s1; 8639371c9d4SSatish Balay t[1 + idx] = s2; 8649371c9d4SSatish Balay t[2 + idx] = s3; 8659371c9d4SSatish Balay t[3 + idx] = s4; 86628e30874SSatish Balay } 86728e30874SSatish Balay /* backward solve the upper triangular */ 86828e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 86928e30874SSatish Balay v = aa + 16 * (adiag[i + 1] + 1); 87028e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 87128e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 87228e30874SSatish Balay idt = 4 * i; 8739371c9d4SSatish Balay s1 = t[idt]; 8749371c9d4SSatish Balay s2 = t[1 + idt]; 8759371c9d4SSatish Balay s3 = t[2 + idt]; 8769371c9d4SSatish Balay s4 = t[3 + idt]; 87728e30874SSatish Balay for (m = 0; m < nz; m++) { 87828e30874SSatish Balay idx = 4 * vi[m]; 8799371c9d4SSatish Balay x1 = t[idx]; 8809371c9d4SSatish Balay x2 = t[1 + idx]; 8819371c9d4SSatish Balay x3 = t[2 + idx]; 8829371c9d4SSatish Balay x4 = t[3 + idx]; 88328e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 88428e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 88528e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 88628e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 88728e30874SSatish Balay v += 16; 88828e30874SSatish Balay } 88928e30874SSatish Balay idc = 4 * c[i]; 89028e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 89128e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 89228e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 89328e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 89428e30874SSatish Balay } 89528e30874SSatish Balay 8969566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8979566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8989566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8999566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9009566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 901*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 90228e30874SSatish Balay } 90328e30874SSatish Balay 904d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) 905d71ae5a4SJacob Faibussowitsch { 90628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 90728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 90828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 90928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 91028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 91128e30874SSatish Balay const MatScalar *aa = a->a, *v; 91228e30874SSatish Balay MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 91328e30874SSatish Balay PetscScalar *x; 91428e30874SSatish Balay const PetscScalar *b; 91528e30874SSatish Balay 91628e30874SSatish Balay PetscFunctionBegin; 9179566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 9189566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 91928e30874SSatish Balay t = (MatScalar *)a->solve_work; 92028e30874SSatish Balay 9219371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 9229371c9d4SSatish Balay r = rout; 9239371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 9249371c9d4SSatish Balay c = cout + (n - 1); 92528e30874SSatish Balay 92628e30874SSatish Balay /* forward solve the lower triangular */ 92728e30874SSatish Balay idx = 4 * (*r++); 92828e30874SSatish Balay t[0] = (MatScalar)b[idx]; 92928e30874SSatish Balay t[1] = (MatScalar)b[1 + idx]; 93028e30874SSatish Balay t[2] = (MatScalar)b[2 + idx]; 93128e30874SSatish Balay t[3] = (MatScalar)b[3 + idx]; 93228e30874SSatish Balay for (i = 1; i < n; i++) { 93328e30874SSatish Balay v = aa + 16 * ai[i]; 93428e30874SSatish Balay vi = aj + ai[i]; 93528e30874SSatish Balay nz = diag[i] - ai[i]; 93628e30874SSatish Balay idx = 4 * (*r++); 93728e30874SSatish Balay s1 = (MatScalar)b[idx]; 93828e30874SSatish Balay s2 = (MatScalar)b[1 + idx]; 93928e30874SSatish Balay s3 = (MatScalar)b[2 + idx]; 94028e30874SSatish Balay s4 = (MatScalar)b[3 + idx]; 94128e30874SSatish Balay while (nz--) { 94228e30874SSatish Balay idx = 4 * (*vi++); 94328e30874SSatish Balay x1 = t[idx]; 94428e30874SSatish Balay x2 = t[1 + idx]; 94528e30874SSatish Balay x3 = t[2 + idx]; 94628e30874SSatish Balay x4 = t[3 + idx]; 94728e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 94828e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 94928e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 95028e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 95128e30874SSatish Balay v += 16; 95228e30874SSatish Balay } 95328e30874SSatish Balay idx = 4 * i; 95428e30874SSatish Balay t[idx] = s1; 95528e30874SSatish Balay t[1 + idx] = s2; 95628e30874SSatish Balay t[2 + idx] = s3; 95728e30874SSatish Balay t[3 + idx] = s4; 95828e30874SSatish Balay } 95928e30874SSatish Balay /* backward solve the upper triangular */ 96028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 96128e30874SSatish Balay v = aa + 16 * diag[i] + 16; 96228e30874SSatish Balay vi = aj + diag[i] + 1; 96328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 96428e30874SSatish Balay idt = 4 * i; 96528e30874SSatish Balay s1 = t[idt]; 96628e30874SSatish Balay s2 = t[1 + idt]; 96728e30874SSatish Balay s3 = t[2 + idt]; 96828e30874SSatish Balay s4 = t[3 + idt]; 96928e30874SSatish Balay while (nz--) { 97028e30874SSatish Balay idx = 4 * (*vi++); 97128e30874SSatish Balay x1 = t[idx]; 97228e30874SSatish Balay x2 = t[1 + idx]; 97328e30874SSatish Balay x3 = t[2 + idx]; 97428e30874SSatish Balay x4 = t[3 + idx]; 97528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 97628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 97728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 97828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 97928e30874SSatish Balay v += 16; 98028e30874SSatish Balay } 98128e30874SSatish Balay idc = 4 * (*c--); 98228e30874SSatish Balay v = aa + 16 * diag[i]; 98328e30874SSatish Balay t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 98428e30874SSatish Balay t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 98528e30874SSatish Balay t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 98628e30874SSatish Balay t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 98728e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 98828e30874SSatish Balay x[1 + idc] = (PetscScalar)t[1 + idt]; 98928e30874SSatish Balay x[2 + idc] = (PetscScalar)t[2 + idt]; 99028e30874SSatish Balay x[3 + idc] = (PetscScalar)t[3 + idt]; 99128e30874SSatish Balay } 99228e30874SSatish Balay 9939566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 9949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 9959566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 9969566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9979566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 998*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 99928e30874SSatish Balay } 100028e30874SSatish Balay 100128e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 100228e30874SSatish Balay 100328e30874SSatish Balay #include PETSC_HAVE_SSE 100428e30874SSatish Balay 1005d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) 1006d71ae5a4SJacob Faibussowitsch { 100728e30874SSatish Balay /* 100828e30874SSatish Balay Note: This code uses demotion of double 100928e30874SSatish Balay to float when performing the mixed-mode computation. 101028e30874SSatish Balay This may not be numerically reasonable for all applications. 101128e30874SSatish Balay */ 101228e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 101328e30874SSatish Balay IS iscol = a->col, isrow = a->row; 101428e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 101528e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 101628e30874SSatish Balay MatScalar *aa = a->a, *v; 101728e30874SSatish Balay PetscScalar *x, *b, *t; 101828e30874SSatish Balay 101928e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 102028e30874SSatish Balay float ssealignedspace[11], *tmps, *tmpx; 102128e30874SSatish Balay unsigned long offset; 102228e30874SSatish Balay 102328e30874SSatish Balay PetscFunctionBegin; 102428e30874SSatish Balay SSE_SCOPE_BEGIN; 102528e30874SSatish Balay 102628e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 102728e30874SSatish Balay if (offset) offset = (16 - offset) / 4; 102828e30874SSatish Balay tmps = &ssealignedspace[offset]; 102928e30874SSatish Balay tmpx = &ssealignedspace[offset + 4]; 103028e30874SSatish Balay PREFETCH_NTA(aa + 16 * ai[1]); 103128e30874SSatish Balay 10329566063dSJacob Faibussowitsch PetscCall(VecGetArray(bb, &b)); 10339566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 103428e30874SSatish Balay t = a->solve_work; 103528e30874SSatish Balay 10369371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 10379371c9d4SSatish Balay r = rout; 10389371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 10399371c9d4SSatish Balay c = cout + (n - 1); 104028e30874SSatish Balay 104128e30874SSatish Balay /* forward solve the lower triangular */ 104228e30874SSatish Balay idx = 4 * (*r++); 10439371c9d4SSatish Balay t[0] = b[idx]; 10449371c9d4SSatish Balay t[1] = b[1 + idx]; 10459371c9d4SSatish Balay t[2] = b[2 + idx]; 10469371c9d4SSatish Balay t[3] = b[3 + idx]; 104728e30874SSatish Balay v = aa + 16 * ai[1]; 104828e30874SSatish Balay 104928e30874SSatish Balay for (i = 1; i < n;) { 105028e30874SSatish Balay PREFETCH_NTA(&v[8]); 105128e30874SSatish Balay vi = aj + ai[i]; 105228e30874SSatish Balay nz = diag[i] - ai[i]; 105328e30874SSatish Balay idx = 4 * (*r++); 105428e30874SSatish Balay 105528e30874SSatish Balay /* Demote sum from double to float */ 105628e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 105728e30874SSatish Balay LOAD_PS(tmps, XMM7); 105828e30874SSatish Balay 105928e30874SSatish Balay while (nz--) { 106028e30874SSatish Balay PREFETCH_NTA(&v[16]); 106128e30874SSatish Balay idx = 4 * (*vi++); 106228e30874SSatish Balay 106328e30874SSatish Balay /* Demote solution (so far) from double to float */ 106428e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 106528e30874SSatish Balay 106628e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 106728e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 106828e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 106928e30874SSatish Balay 107028e30874SSatish Balay /* First Column */ 107128e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 107228e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 107328e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 107428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 107528e30874SSatish Balay 107628e30874SSatish Balay /* Second Column */ 107728e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 107828e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 107928e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 108028e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 108128e30874SSatish Balay 108228e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 108328e30874SSatish Balay 108428e30874SSatish Balay /* Third Column */ 108528e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 108628e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 108728e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 108828e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 108928e30874SSatish Balay 109028e30874SSatish Balay /* Fourth Column */ 109128e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 109228e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 109328e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 109428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 109528e30874SSatish Balay SSE_INLINE_END_2 109628e30874SSatish Balay 109728e30874SSatish Balay v += 16; 109828e30874SSatish Balay } 109928e30874SSatish Balay idx = 4 * i; 110028e30874SSatish Balay v = aa + 16 * ai[++i]; 110128e30874SSatish Balay PREFETCH_NTA(v); 110228e30874SSatish Balay STORE_PS(tmps, XMM7); 110328e30874SSatish Balay 110428e30874SSatish Balay /* Promote result from float to double */ 110528e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 110628e30874SSatish Balay } 110728e30874SSatish Balay /* backward solve the upper triangular */ 110828e30874SSatish Balay idt = 4 * (n - 1); 110928e30874SSatish Balay ai16 = 16 * diag[n - 1]; 111028e30874SSatish Balay v = aa + ai16 + 16; 111128e30874SSatish Balay for (i = n - 1; i >= 0;) { 111228e30874SSatish Balay PREFETCH_NTA(&v[8]); 111328e30874SSatish Balay vi = aj + diag[i] + 1; 111428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 111528e30874SSatish Balay 111628e30874SSatish Balay /* Demote accumulator from double to float */ 111728e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 111828e30874SSatish Balay LOAD_PS(tmps, XMM7); 111928e30874SSatish Balay 112028e30874SSatish Balay while (nz--) { 112128e30874SSatish Balay PREFETCH_NTA(&v[16]); 112228e30874SSatish Balay idx = 4 * (*vi++); 112328e30874SSatish Balay 112428e30874SSatish Balay /* Demote solution (so far) from double to float */ 112528e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 112628e30874SSatish Balay 112728e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 112828e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 112928e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 113028e30874SSatish Balay 113128e30874SSatish Balay /* First Column */ 113228e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 113328e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 113428e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 113528e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 113628e30874SSatish Balay 113728e30874SSatish Balay /* Second Column */ 113828e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 113928e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 114028e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 114128e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 114228e30874SSatish Balay 114328e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 114428e30874SSatish Balay 114528e30874SSatish Balay /* Third Column */ 114628e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 114728e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 114828e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 114928e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 115028e30874SSatish Balay 115128e30874SSatish Balay /* Fourth Column */ 115228e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 115328e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 115428e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 115528e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 115628e30874SSatish Balay SSE_INLINE_END_2 115728e30874SSatish Balay v += 16; 115828e30874SSatish Balay } 115928e30874SSatish Balay v = aa + ai16; 116028e30874SSatish Balay ai16 = 16 * diag[--i]; 116128e30874SSatish Balay PREFETCH_NTA(aa + ai16 + 16); 116228e30874SSatish Balay /* 116328e30874SSatish Balay Scale the result by the diagonal 4x4 block, 116428e30874SSatish Balay which was inverted as part of the factorization 116528e30874SSatish Balay */ 116628e30874SSatish Balay SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 116728e30874SSatish Balay /* First Column */ 116828e30874SSatish Balay SSE_COPY_PS(XMM0, XMM7) 116928e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 117028e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 117128e30874SSatish Balay 117228e30874SSatish Balay /* Second Column */ 117328e30874SSatish Balay SSE_COPY_PS(XMM1, XMM7) 117428e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 117528e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 117628e30874SSatish Balay SSE_ADD_PS(XMM0, XMM1) 117728e30874SSatish Balay 117828e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 117928e30874SSatish Balay 118028e30874SSatish Balay /* Third Column */ 118128e30874SSatish Balay SSE_COPY_PS(XMM2, XMM7) 118228e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 118328e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 118428e30874SSatish Balay SSE_ADD_PS(XMM0, XMM2) 118528e30874SSatish Balay 118628e30874SSatish Balay /* Fourth Column */ 118728e30874SSatish Balay SSE_COPY_PS(XMM3, XMM7) 118828e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 118928e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 119028e30874SSatish Balay SSE_ADD_PS(XMM0, XMM3) 119128e30874SSatish Balay 119228e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 119328e30874SSatish Balay SSE_INLINE_END_3 119428e30874SSatish Balay 119528e30874SSatish Balay /* Promote solution from float to double */ 119628e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 119728e30874SSatish Balay 119828e30874SSatish Balay /* Apply reordering to t and stream into x. */ 119928e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 120028e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 120128e30874SSatish Balay idc = 4 * (*c--); 120228e30874SSatish Balay SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 120328e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 120428e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 120528e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 120628e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 120728e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 120828e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 120928e30874SSatish Balay SSE_INLINE_END_2 121028e30874SSatish Balay v = aa + ai16 + 16; 121128e30874SSatish Balay idt -= 4; 121228e30874SSatish Balay } 121328e30874SSatish Balay 12149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12159566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(bb, &b)); 12179566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12189566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 121928e30874SSatish Balay SSE_SCOPE_END; 1220*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 122128e30874SSatish Balay } 122228e30874SSatish Balay 122328e30874SSatish Balay #endif 122428e30874SSatish Balay 1225d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1226d71ae5a4SJacob Faibussowitsch { 122728e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 122828e30874SSatish Balay IS iscol = a->col, isrow = a->row; 122928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 123028e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 123128e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 123228e30874SSatish Balay const MatScalar *aa = a->a, *v; 123328e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 123428e30874SSatish Balay const PetscScalar *b; 123528e30874SSatish Balay 123628e30874SSatish Balay PetscFunctionBegin; 12379566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 12389566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 123928e30874SSatish Balay t = a->solve_work; 124028e30874SSatish Balay 12419371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 12429371c9d4SSatish Balay r = rout; 12439371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 12449371c9d4SSatish Balay c = cout + (n - 1); 124528e30874SSatish Balay 124628e30874SSatish Balay /* forward solve the lower triangular */ 124728e30874SSatish Balay idx = 3 * (*r++); 12489371c9d4SSatish Balay t[0] = b[idx]; 12499371c9d4SSatish Balay t[1] = b[1 + idx]; 12509371c9d4SSatish Balay t[2] = b[2 + idx]; 125128e30874SSatish Balay for (i = 1; i < n; i++) { 125228e30874SSatish Balay v = aa + 9 * ai[i]; 125328e30874SSatish Balay vi = aj + ai[i]; 125428e30874SSatish Balay nz = diag[i] - ai[i]; 125528e30874SSatish Balay idx = 3 * (*r++); 12569371c9d4SSatish Balay s1 = b[idx]; 12579371c9d4SSatish Balay s2 = b[1 + idx]; 12589371c9d4SSatish Balay s3 = b[2 + idx]; 125928e30874SSatish Balay while (nz--) { 126028e30874SSatish Balay idx = 3 * (*vi++); 12619371c9d4SSatish Balay x1 = t[idx]; 12629371c9d4SSatish Balay x2 = t[1 + idx]; 12639371c9d4SSatish Balay x3 = t[2 + idx]; 126428e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 126528e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 126628e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 126728e30874SSatish Balay v += 9; 126828e30874SSatish Balay } 126928e30874SSatish Balay idx = 3 * i; 12709371c9d4SSatish Balay t[idx] = s1; 12719371c9d4SSatish Balay t[1 + idx] = s2; 12729371c9d4SSatish Balay t[2 + idx] = s3; 127328e30874SSatish Balay } 127428e30874SSatish Balay /* backward solve the upper triangular */ 127528e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 127628e30874SSatish Balay v = aa + 9 * diag[i] + 9; 127728e30874SSatish Balay vi = aj + diag[i] + 1; 127828e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 127928e30874SSatish Balay idt = 3 * i; 12809371c9d4SSatish Balay s1 = t[idt]; 12819371c9d4SSatish Balay s2 = t[1 + idt]; 12829371c9d4SSatish Balay s3 = t[2 + idt]; 128328e30874SSatish Balay while (nz--) { 128428e30874SSatish Balay idx = 3 * (*vi++); 12859371c9d4SSatish Balay x1 = t[idx]; 12869371c9d4SSatish Balay x2 = t[1 + idx]; 12879371c9d4SSatish Balay x3 = t[2 + idx]; 128828e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 128928e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 129028e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 129128e30874SSatish Balay v += 9; 129228e30874SSatish Balay } 129328e30874SSatish Balay idc = 3 * (*c--); 129428e30874SSatish Balay v = aa + 9 * diag[i]; 129528e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 129628e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 129728e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 129828e30874SSatish Balay } 12999566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1304*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 130528e30874SSatish Balay } 130628e30874SSatish Balay 1307d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) 1308d71ae5a4SJacob Faibussowitsch { 130928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 131028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 131128e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 131228e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 131328e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 131428e30874SSatish Balay const MatScalar *aa = a->a, *v; 131528e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 131628e30874SSatish Balay const PetscScalar *b; 131728e30874SSatish Balay 131828e30874SSatish Balay PetscFunctionBegin; 13199566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13209566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 132128e30874SSatish Balay t = a->solve_work; 132228e30874SSatish Balay 13239371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 13249371c9d4SSatish Balay r = rout; 13259371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 13269371c9d4SSatish Balay c = cout; 132728e30874SSatish Balay 132828e30874SSatish Balay /* forward solve the lower triangular */ 132928e30874SSatish Balay idx = 3 * r[0]; 13309371c9d4SSatish Balay t[0] = b[idx]; 13319371c9d4SSatish Balay t[1] = b[1 + idx]; 13329371c9d4SSatish Balay t[2] = b[2 + idx]; 133328e30874SSatish Balay for (i = 1; i < n; i++) { 133428e30874SSatish Balay v = aa + 9 * ai[i]; 133528e30874SSatish Balay vi = aj + ai[i]; 133628e30874SSatish Balay nz = ai[i + 1] - ai[i]; 133728e30874SSatish Balay idx = 3 * r[i]; 13389371c9d4SSatish Balay s1 = b[idx]; 13399371c9d4SSatish Balay s2 = b[1 + idx]; 13409371c9d4SSatish Balay s3 = b[2 + idx]; 134128e30874SSatish Balay for (m = 0; m < nz; m++) { 134228e30874SSatish Balay idx = 3 * vi[m]; 13439371c9d4SSatish Balay x1 = t[idx]; 13449371c9d4SSatish Balay x2 = t[1 + idx]; 13459371c9d4SSatish Balay x3 = t[2 + idx]; 134628e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 134728e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 134828e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 134928e30874SSatish Balay v += 9; 135028e30874SSatish Balay } 135128e30874SSatish Balay idx = 3 * i; 13529371c9d4SSatish Balay t[idx] = s1; 13539371c9d4SSatish Balay t[1 + idx] = s2; 13549371c9d4SSatish Balay t[2 + idx] = s3; 135528e30874SSatish Balay } 135628e30874SSatish Balay /* backward solve the upper triangular */ 135728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 135828e30874SSatish Balay v = aa + 9 * (adiag[i + 1] + 1); 135928e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 136028e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 136128e30874SSatish Balay idt = 3 * i; 13629371c9d4SSatish Balay s1 = t[idt]; 13639371c9d4SSatish Balay s2 = t[1 + idt]; 13649371c9d4SSatish Balay s3 = t[2 + idt]; 136528e30874SSatish Balay for (m = 0; m < nz; m++) { 136628e30874SSatish Balay idx = 3 * vi[m]; 13679371c9d4SSatish Balay x1 = t[idx]; 13689371c9d4SSatish Balay x2 = t[1 + idx]; 13699371c9d4SSatish Balay x3 = t[2 + idx]; 137028e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 137128e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 137228e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 137328e30874SSatish Balay v += 9; 137428e30874SSatish Balay } 137528e30874SSatish Balay idc = 3 * c[i]; 137628e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 137728e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 137828e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 137928e30874SSatish Balay } 13809566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13819566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13829566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13839566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13849566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 1385*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138628e30874SSatish Balay } 138728e30874SSatish Balay 1388d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1389d71ae5a4SJacob Faibussowitsch { 139028e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 139128e30874SSatish Balay IS iscol = a->col, isrow = a->row; 139228e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 139328e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 139428e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 139528e30874SSatish Balay const MatScalar *aa = a->a, *v; 139628e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 139728e30874SSatish Balay const PetscScalar *b; 139828e30874SSatish Balay 139928e30874SSatish Balay PetscFunctionBegin; 14009566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14019566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 140228e30874SSatish Balay t = a->solve_work; 140328e30874SSatish Balay 14049371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14059371c9d4SSatish Balay r = rout; 14069371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14079371c9d4SSatish Balay c = cout + (n - 1); 140828e30874SSatish Balay 140928e30874SSatish Balay /* forward solve the lower triangular */ 141028e30874SSatish Balay idx = 2 * (*r++); 14119371c9d4SSatish Balay t[0] = b[idx]; 14129371c9d4SSatish Balay t[1] = b[1 + idx]; 141328e30874SSatish Balay for (i = 1; i < n; i++) { 141428e30874SSatish Balay v = aa + 4 * ai[i]; 141528e30874SSatish Balay vi = aj + ai[i]; 141628e30874SSatish Balay nz = diag[i] - ai[i]; 141728e30874SSatish Balay idx = 2 * (*r++); 14189371c9d4SSatish Balay s1 = b[idx]; 14199371c9d4SSatish Balay s2 = b[1 + idx]; 142028e30874SSatish Balay while (nz--) { 142128e30874SSatish Balay idx = 2 * (*vi++); 14229371c9d4SSatish Balay x1 = t[idx]; 14239371c9d4SSatish Balay x2 = t[1 + idx]; 142428e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 142528e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 142628e30874SSatish Balay v += 4; 142728e30874SSatish Balay } 142828e30874SSatish Balay idx = 2 * i; 14299371c9d4SSatish Balay t[idx] = s1; 14309371c9d4SSatish Balay t[1 + idx] = s2; 143128e30874SSatish Balay } 143228e30874SSatish Balay /* backward solve the upper triangular */ 143328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 143428e30874SSatish Balay v = aa + 4 * diag[i] + 4; 143528e30874SSatish Balay vi = aj + diag[i] + 1; 143628e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 143728e30874SSatish Balay idt = 2 * i; 14389371c9d4SSatish Balay s1 = t[idt]; 14399371c9d4SSatish Balay s2 = t[1 + idt]; 144028e30874SSatish Balay while (nz--) { 144128e30874SSatish Balay idx = 2 * (*vi++); 14429371c9d4SSatish Balay x1 = t[idx]; 14439371c9d4SSatish Balay x2 = t[1 + idx]; 144428e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 144528e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 144628e30874SSatish Balay v += 4; 144728e30874SSatish Balay } 144828e30874SSatish Balay idc = 2 * (*c--); 144928e30874SSatish Balay v = aa + 4 * diag[i]; 145028e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 145128e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 145228e30874SSatish Balay } 14539566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 14549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 14559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 14569566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 14579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1458*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 145928e30874SSatish Balay } 146028e30874SSatish Balay 1461d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) 1462d71ae5a4SJacob Faibussowitsch { 146328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 146428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 146528e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 146628e30874SSatish Balay PetscInt i, nz, idx, jdx, idt, idc, m; 146728e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 146828e30874SSatish Balay const MatScalar *aa = a->a, *v; 146928e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 147028e30874SSatish Balay const PetscScalar *b; 147128e30874SSatish Balay 147228e30874SSatish Balay PetscFunctionBegin; 14739566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14749566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 147528e30874SSatish Balay t = a->solve_work; 147628e30874SSatish Balay 14779371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14789371c9d4SSatish Balay r = rout; 14799371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14809371c9d4SSatish Balay c = cout; 148128e30874SSatish Balay 148228e30874SSatish Balay /* forward solve the lower triangular */ 148328e30874SSatish Balay idx = 2 * r[0]; 14849371c9d4SSatish Balay t[0] = b[idx]; 14859371c9d4SSatish Balay t[1] = b[1 + idx]; 148628e30874SSatish Balay for (i = 1; i < n; i++) { 148728e30874SSatish Balay v = aa + 4 * ai[i]; 148828e30874SSatish Balay vi = aj + ai[i]; 148928e30874SSatish Balay nz = ai[i + 1] - ai[i]; 149028e30874SSatish Balay idx = 2 * r[i]; 14919371c9d4SSatish Balay s1 = b[idx]; 14929371c9d4SSatish Balay s2 = b[1 + idx]; 149328e30874SSatish Balay for (m = 0; m < nz; m++) { 149428e30874SSatish Balay jdx = 2 * vi[m]; 14959371c9d4SSatish Balay x1 = t[jdx]; 14969371c9d4SSatish Balay x2 = t[1 + jdx]; 149728e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 149828e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 149928e30874SSatish Balay v += 4; 150028e30874SSatish Balay } 150128e30874SSatish Balay idx = 2 * i; 15029371c9d4SSatish Balay t[idx] = s1; 15039371c9d4SSatish Balay t[1 + idx] = s2; 150428e30874SSatish Balay } 150528e30874SSatish Balay /* backward solve the upper triangular */ 150628e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 150728e30874SSatish Balay v = aa + 4 * (adiag[i + 1] + 1); 150828e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 150928e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 151028e30874SSatish Balay idt = 2 * i; 15119371c9d4SSatish Balay s1 = t[idt]; 15129371c9d4SSatish Balay s2 = t[1 + idt]; 151328e30874SSatish Balay for (m = 0; m < nz; m++) { 151428e30874SSatish Balay idx = 2 * vi[m]; 15159371c9d4SSatish Balay x1 = t[idx]; 15169371c9d4SSatish Balay x2 = t[1 + idx]; 151728e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 151828e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 151928e30874SSatish Balay v += 4; 152028e30874SSatish Balay } 152128e30874SSatish Balay idc = 2 * c[i]; 152228e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 152328e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 152428e30874SSatish Balay } 15259566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15269566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15279566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15289566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15299566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 1530*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 153128e30874SSatish Balay } 153228e30874SSatish Balay 1533d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1534d71ae5a4SJacob Faibussowitsch { 153528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 153628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 153728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 153828e30874SSatish Balay PetscInt i, nz; 153928e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 154028e30874SSatish Balay const MatScalar *aa = a->a, *v; 154128e30874SSatish Balay PetscScalar *x, s1, *t; 154228e30874SSatish Balay const PetscScalar *b; 154328e30874SSatish Balay 154428e30874SSatish Balay PetscFunctionBegin; 1545*3ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 154628e30874SSatish Balay 15479566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15489566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 154928e30874SSatish Balay t = a->solve_work; 155028e30874SSatish Balay 15519371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 15529371c9d4SSatish Balay r = rout; 15539371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 15549371c9d4SSatish Balay c = cout + (n - 1); 155528e30874SSatish Balay 155628e30874SSatish Balay /* forward solve the lower triangular */ 155728e30874SSatish Balay t[0] = b[*r++]; 155828e30874SSatish Balay for (i = 1; i < n; i++) { 155928e30874SSatish Balay v = aa + ai[i]; 156028e30874SSatish Balay vi = aj + ai[i]; 156128e30874SSatish Balay nz = diag[i] - ai[i]; 156228e30874SSatish Balay s1 = b[*r++]; 1563ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 156428e30874SSatish Balay t[i] = s1; 156528e30874SSatish Balay } 156628e30874SSatish Balay /* backward solve the upper triangular */ 156728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 156828e30874SSatish Balay v = aa + diag[i] + 1; 156928e30874SSatish Balay vi = aj + diag[i] + 1; 157028e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 157128e30874SSatish Balay s1 = t[i]; 1572ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 157328e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]] * s1; 157428e30874SSatish Balay } 157528e30874SSatish Balay 15769566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15779566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15789566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15799566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15809566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 1581*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 158228e30874SSatish Balay } 158328e30874SSatish Balay 1584d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) 1585d71ae5a4SJacob Faibussowitsch { 158628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 158728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 158828e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 158928e30874SSatish Balay const PetscInt *rout, *cout, *r, *c; 159028e30874SSatish Balay PetscScalar *x, *tmp, sum; 159128e30874SSatish Balay const PetscScalar *b; 159228e30874SSatish Balay const MatScalar *aa = a->a, *v; 159328e30874SSatish Balay 159428e30874SSatish Balay PetscFunctionBegin; 1595*3ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 159628e30874SSatish Balay 15979566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15989566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 159928e30874SSatish Balay tmp = a->solve_work; 160028e30874SSatish Balay 16019371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 16029371c9d4SSatish Balay r = rout; 16039371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 16049371c9d4SSatish Balay c = cout; 160528e30874SSatish Balay 160628e30874SSatish Balay /* forward solve the lower triangular */ 160728e30874SSatish Balay tmp[0] = b[r[0]]; 160828e30874SSatish Balay v = aa; 160928e30874SSatish Balay vi = aj; 161028e30874SSatish Balay for (i = 1; i < n; i++) { 161128e30874SSatish Balay nz = ai[i + 1] - ai[i]; 161228e30874SSatish Balay sum = b[r[i]]; 161328e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 161428e30874SSatish Balay tmp[i] = sum; 16159371c9d4SSatish Balay v += nz; 16169371c9d4SSatish Balay vi += nz; 161728e30874SSatish Balay } 161828e30874SSatish Balay 161928e30874SSatish Balay /* backward solve the upper triangular */ 162028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 162128e30874SSatish Balay v = aa + adiag[i + 1] + 1; 162228e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 162328e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 162428e30874SSatish Balay sum = tmp[i]; 162528e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 162628e30874SSatish Balay x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 162728e30874SSatish Balay } 162828e30874SSatish Balay 16299566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 16309566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 16319566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 16329566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 16339566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 1634*3ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 163528e30874SSatish Balay } 1636