128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 328e30874SSatish Balay 4d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) 5d71ae5a4SJacob Faibussowitsch { 628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 928e30874SSatish Balay const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 1028e30874SSatish Balay PetscInt i, nz; 1128e30874SSatish Balay const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 1228e30874SSatish Balay const MatScalar *aa = a->a, *v; 1328e30874SSatish Balay PetscScalar *x, *s, *t, *ls; 1428e30874SSatish Balay const PetscScalar *b; 1528e30874SSatish Balay 1628e30874SSatish Balay PetscFunctionBegin; 17*ce6b3536SJed Brown PetscCheck(bs > 0, PETSC_COMM_SELF, PETSC_ERR_PLIB, "Expected bs %" PetscInt_FMT " > 0", bs); 189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 2028e30874SSatish Balay t = a->solve_work; 2128e30874SSatish Balay 229371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 239371c9d4SSatish Balay r = rout; 249371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 259371c9d4SSatish Balay c = cout + (n - 1); 2628e30874SSatish Balay 2728e30874SSatish Balay /* forward solve the lower triangular */ 289566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 2928e30874SSatish Balay for (i = 1; i < n; i++) { 3028e30874SSatish Balay v = aa + bs2 * ai[i]; 3128e30874SSatish Balay vi = aj + ai[i]; 3228e30874SSatish Balay nz = a->diag[i] - ai[i]; 3328e30874SSatish Balay s = t + bs * i; 349566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 3528e30874SSatish Balay while (nz--) { 3628e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 3728e30874SSatish Balay v += bs2; 3828e30874SSatish Balay } 3928e30874SSatish Balay } 4028e30874SSatish Balay /* backward solve the upper triangular */ 4128e30874SSatish Balay ls = a->solve_work + A->cmap->n; 4228e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 4328e30874SSatish Balay v = aa + bs2 * (a->diag[i] + 1); 4428e30874SSatish Balay vi = aj + a->diag[i] + 1; 4528e30874SSatish Balay nz = ai[i + 1] - a->diag[i] - 1; 469566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 4728e30874SSatish Balay while (nz--) { 4828e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 4928e30874SSatish Balay v += bs2; 5028e30874SSatish Balay } 5128e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 529566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 5328e30874SSatish Balay } 5428e30874SSatish Balay 559566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 569566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 579566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 589566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 599566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 603ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 6128e30874SSatish Balay } 6228e30874SSatish Balay 63d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) 64d71ae5a4SJacob Faibussowitsch { 6528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 6728e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j; 6828e30874SSatish Balay const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 6928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 7028e30874SSatish Balay const MatScalar *aa = a->a, *v; 7128e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 7228e30874SSatish Balay const PetscScalar *b; 7328e30874SSatish Balay 7428e30874SSatish Balay PetscFunctionBegin; 759566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 769566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 7728e30874SSatish Balay t = a->solve_work; 7828e30874SSatish Balay 799371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 809371c9d4SSatish Balay r = rout; 819371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 829371c9d4SSatish Balay c = cout + (n - 1); 8328e30874SSatish Balay 8428e30874SSatish Balay /* forward solve the lower triangular */ 8528e30874SSatish Balay idx = 7 * (*r++); 869371c9d4SSatish Balay t[0] = b[idx]; 879371c9d4SSatish Balay t[1] = b[1 + idx]; 889371c9d4SSatish Balay t[2] = b[2 + idx]; 899371c9d4SSatish Balay t[3] = b[3 + idx]; 909371c9d4SSatish Balay t[4] = b[4 + idx]; 919371c9d4SSatish Balay t[5] = b[5 + idx]; 929371c9d4SSatish Balay t[6] = b[6 + idx]; 9328e30874SSatish Balay 9428e30874SSatish Balay for (i = 1; i < n; i++) { 9528e30874SSatish Balay v = aa + 49 * ai[i]; 9628e30874SSatish Balay vi = aj + ai[i]; 9728e30874SSatish Balay nz = diag[i] - ai[i]; 9828e30874SSatish Balay idx = 7 * (*r++); 999371c9d4SSatish Balay s1 = b[idx]; 1009371c9d4SSatish Balay s2 = b[1 + idx]; 1019371c9d4SSatish Balay s3 = b[2 + idx]; 1029371c9d4SSatish Balay s4 = b[3 + idx]; 1039371c9d4SSatish Balay s5 = b[4 + idx]; 1049371c9d4SSatish Balay s6 = b[5 + idx]; 1059371c9d4SSatish Balay s7 = b[6 + idx]; 10628e30874SSatish Balay while (nz--) { 10728e30874SSatish Balay idx = 7 * (*vi++); 1089371c9d4SSatish Balay x1 = t[idx]; 1099371c9d4SSatish Balay x2 = t[1 + idx]; 1109371c9d4SSatish Balay x3 = t[2 + idx]; 1119371c9d4SSatish Balay x4 = t[3 + idx]; 1129371c9d4SSatish Balay x5 = t[4 + idx]; 1139371c9d4SSatish Balay x6 = t[5 + idx]; 1149371c9d4SSatish Balay x7 = t[6 + idx]; 11528e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 11628e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 11728e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 11828e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 11928e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 12028e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 12128e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 12228e30874SSatish Balay v += 49; 12328e30874SSatish Balay } 12428e30874SSatish Balay idx = 7 * i; 1259371c9d4SSatish Balay t[idx] = s1; 1269371c9d4SSatish Balay t[1 + idx] = s2; 1279371c9d4SSatish Balay t[2 + idx] = s3; 1289371c9d4SSatish Balay t[3 + idx] = s4; 1299371c9d4SSatish Balay t[4 + idx] = s5; 1309371c9d4SSatish Balay t[5 + idx] = s6; 1319371c9d4SSatish Balay t[6 + idx] = s7; 13228e30874SSatish Balay } 13328e30874SSatish Balay /* backward solve the upper triangular */ 13428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 13528e30874SSatish Balay v = aa + 49 * diag[i] + 49; 13628e30874SSatish Balay vi = aj + diag[i] + 1; 13728e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 13828e30874SSatish Balay idt = 7 * i; 1399371c9d4SSatish Balay s1 = t[idt]; 1409371c9d4SSatish Balay s2 = t[1 + idt]; 1419371c9d4SSatish Balay s3 = t[2 + idt]; 1429371c9d4SSatish Balay s4 = t[3 + idt]; 1439371c9d4SSatish Balay s5 = t[4 + idt]; 1449371c9d4SSatish Balay s6 = t[5 + idt]; 1459371c9d4SSatish Balay s7 = t[6 + idt]; 14628e30874SSatish Balay while (nz--) { 14728e30874SSatish Balay idx = 7 * (*vi++); 1489371c9d4SSatish Balay x1 = t[idx]; 1499371c9d4SSatish Balay x2 = t[1 + idx]; 1509371c9d4SSatish Balay x3 = t[2 + idx]; 1519371c9d4SSatish Balay x4 = t[3 + idx]; 1529371c9d4SSatish Balay x5 = t[4 + idx]; 1539371c9d4SSatish Balay x6 = t[5 + idx]; 1549371c9d4SSatish Balay x7 = t[6 + idx]; 15528e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 15628e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 15728e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 15828e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 15928e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 16028e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 16128e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 16228e30874SSatish Balay v += 49; 16328e30874SSatish Balay } 16428e30874SSatish Balay idc = 7 * (*c--); 16528e30874SSatish Balay v = aa + 49 * diag[i]; 1669371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 1679371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 1689371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 1699371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 1709371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 1719371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 1729371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 17328e30874SSatish Balay } 17428e30874SSatish Balay 1759566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 1769566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 1779566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 1789566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 1799566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 1803ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 18128e30874SSatish Balay } 18228e30874SSatish Balay 183d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) 184d71ae5a4SJacob Faibussowitsch { 18528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 18728e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 18828e30874SSatish Balay const PetscInt n = a->mbs, *rout, *cout, *vi; 18928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 19028e30874SSatish Balay const MatScalar *aa = a->a, *v; 19128e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 19228e30874SSatish Balay const PetscScalar *b; 19328e30874SSatish Balay 19428e30874SSatish Balay PetscFunctionBegin; 1959566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 1969566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 19728e30874SSatish Balay t = a->solve_work; 19828e30874SSatish Balay 1999371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 2009371c9d4SSatish Balay r = rout; 2019371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 2029371c9d4SSatish Balay c = cout; 20328e30874SSatish Balay 20428e30874SSatish Balay /* forward solve the lower triangular */ 20528e30874SSatish Balay idx = 7 * r[0]; 2069371c9d4SSatish Balay t[0] = b[idx]; 2079371c9d4SSatish Balay t[1] = b[1 + idx]; 2089371c9d4SSatish Balay t[2] = b[2 + idx]; 2099371c9d4SSatish Balay t[3] = b[3 + idx]; 2109371c9d4SSatish Balay t[4] = b[4 + idx]; 2119371c9d4SSatish Balay t[5] = b[5 + idx]; 2129371c9d4SSatish Balay t[6] = b[6 + idx]; 21328e30874SSatish Balay 21428e30874SSatish Balay for (i = 1; i < n; i++) { 21528e30874SSatish Balay v = aa + 49 * ai[i]; 21628e30874SSatish Balay vi = aj + ai[i]; 21728e30874SSatish Balay nz = ai[i + 1] - ai[i]; 21828e30874SSatish Balay idx = 7 * r[i]; 2199371c9d4SSatish Balay s1 = b[idx]; 2209371c9d4SSatish Balay s2 = b[1 + idx]; 2219371c9d4SSatish Balay s3 = b[2 + idx]; 2229371c9d4SSatish Balay s4 = b[3 + idx]; 2239371c9d4SSatish Balay s5 = b[4 + idx]; 2249371c9d4SSatish Balay s6 = b[5 + idx]; 2259371c9d4SSatish Balay s7 = b[6 + idx]; 22628e30874SSatish Balay for (m = 0; m < nz; m++) { 22728e30874SSatish Balay idx = 7 * vi[m]; 2289371c9d4SSatish Balay x1 = t[idx]; 2299371c9d4SSatish Balay x2 = t[1 + idx]; 2309371c9d4SSatish Balay x3 = t[2 + idx]; 2319371c9d4SSatish Balay x4 = t[3 + idx]; 2329371c9d4SSatish Balay x5 = t[4 + idx]; 2339371c9d4SSatish Balay x6 = t[5 + idx]; 2349371c9d4SSatish Balay x7 = t[6 + idx]; 23528e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 23628e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 23728e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 23828e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 23928e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 24028e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 24128e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 24228e30874SSatish Balay v += 49; 24328e30874SSatish Balay } 24428e30874SSatish Balay idx = 7 * i; 2459371c9d4SSatish Balay t[idx] = s1; 2469371c9d4SSatish Balay t[1 + idx] = s2; 2479371c9d4SSatish Balay t[2 + idx] = s3; 2489371c9d4SSatish Balay t[3 + idx] = s4; 2499371c9d4SSatish Balay t[4 + idx] = s5; 2509371c9d4SSatish Balay t[5 + idx] = s6; 2519371c9d4SSatish Balay t[6 + idx] = s7; 25228e30874SSatish Balay } 25328e30874SSatish Balay /* backward solve the upper triangular */ 25428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 25528e30874SSatish Balay v = aa + 49 * (adiag[i + 1] + 1); 25628e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 25728e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 25828e30874SSatish Balay idt = 7 * i; 2599371c9d4SSatish Balay s1 = t[idt]; 2609371c9d4SSatish Balay s2 = t[1 + idt]; 2619371c9d4SSatish Balay s3 = t[2 + idt]; 2629371c9d4SSatish Balay s4 = t[3 + idt]; 2639371c9d4SSatish Balay s5 = t[4 + idt]; 2649371c9d4SSatish Balay s6 = t[5 + idt]; 2659371c9d4SSatish Balay s7 = t[6 + idt]; 26628e30874SSatish Balay for (m = 0; m < nz; m++) { 26728e30874SSatish Balay idx = 7 * vi[m]; 2689371c9d4SSatish Balay x1 = t[idx]; 2699371c9d4SSatish Balay x2 = t[1 + idx]; 2709371c9d4SSatish Balay x3 = t[2 + idx]; 2719371c9d4SSatish Balay x4 = t[3 + idx]; 2729371c9d4SSatish Balay x5 = t[4 + idx]; 2739371c9d4SSatish Balay x6 = t[5 + idx]; 2749371c9d4SSatish Balay x7 = t[6 + idx]; 27528e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 27628e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 27728e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 27828e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 27928e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 28028e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 28128e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 28228e30874SSatish Balay v += 49; 28328e30874SSatish Balay } 28428e30874SSatish Balay idc = 7 * c[i]; 2859371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 2869371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 2879371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 2889371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 2899371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 2909371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 2919371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 29228e30874SSatish Balay } 29328e30874SSatish Balay 2949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 2959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 2969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 2979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 2989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 2993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 30028e30874SSatish Balay } 30128e30874SSatish Balay 302d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) 303d71ae5a4SJacob Faibussowitsch { 30428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 30528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 30628e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 30728e30874SSatish Balay const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 30828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 30928e30874SSatish Balay const MatScalar *aa = a->a, *v; 31028e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 31128e30874SSatish Balay const PetscScalar *b; 31228e30874SSatish Balay 31328e30874SSatish Balay PetscFunctionBegin; 3149566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 3159566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 31628e30874SSatish Balay t = a->solve_work; 31728e30874SSatish Balay 3189371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 3199371c9d4SSatish Balay r = rout; 3209371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 3219371c9d4SSatish Balay c = cout + (n - 1); 32228e30874SSatish Balay 32328e30874SSatish Balay /* forward solve the lower triangular */ 32428e30874SSatish Balay idx = 6 * (*r++); 3259371c9d4SSatish Balay t[0] = b[idx]; 3269371c9d4SSatish Balay t[1] = b[1 + idx]; 3279371c9d4SSatish Balay t[2] = b[2 + idx]; 3289371c9d4SSatish Balay t[3] = b[3 + idx]; 3299371c9d4SSatish Balay t[4] = b[4 + idx]; 3309371c9d4SSatish Balay t[5] = b[5 + idx]; 33128e30874SSatish Balay for (i = 1; i < n; i++) { 33228e30874SSatish Balay v = aa + 36 * ai[i]; 33328e30874SSatish Balay vi = aj + ai[i]; 33428e30874SSatish Balay nz = diag[i] - ai[i]; 33528e30874SSatish Balay idx = 6 * (*r++); 3369371c9d4SSatish Balay s1 = b[idx]; 3379371c9d4SSatish Balay s2 = b[1 + idx]; 3389371c9d4SSatish Balay s3 = b[2 + idx]; 3399371c9d4SSatish Balay s4 = b[3 + idx]; 3409371c9d4SSatish Balay s5 = b[4 + idx]; 3419371c9d4SSatish Balay s6 = b[5 + idx]; 34228e30874SSatish Balay while (nz--) { 34328e30874SSatish Balay idx = 6 * (*vi++); 3449371c9d4SSatish Balay x1 = t[idx]; 3459371c9d4SSatish Balay x2 = t[1 + idx]; 3469371c9d4SSatish Balay x3 = t[2 + idx]; 3479371c9d4SSatish Balay x4 = t[3 + idx]; 3489371c9d4SSatish Balay x5 = t[4 + idx]; 3499371c9d4SSatish Balay x6 = t[5 + idx]; 35028e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 35128e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 35228e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 35328e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 35428e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 35528e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 35628e30874SSatish Balay v += 36; 35728e30874SSatish Balay } 35828e30874SSatish Balay idx = 6 * i; 3599371c9d4SSatish Balay t[idx] = s1; 3609371c9d4SSatish Balay t[1 + idx] = s2; 3619371c9d4SSatish Balay t[2 + idx] = s3; 3629371c9d4SSatish Balay t[3 + idx] = s4; 3639371c9d4SSatish Balay t[4 + idx] = s5; 3649371c9d4SSatish Balay t[5 + idx] = s6; 36528e30874SSatish Balay } 36628e30874SSatish Balay /* backward solve the upper triangular */ 36728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 36828e30874SSatish Balay v = aa + 36 * diag[i] + 36; 36928e30874SSatish Balay vi = aj + diag[i] + 1; 37028e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 37128e30874SSatish Balay idt = 6 * i; 3729371c9d4SSatish Balay s1 = t[idt]; 3739371c9d4SSatish Balay s2 = t[1 + idt]; 3749371c9d4SSatish Balay s3 = t[2 + idt]; 3759371c9d4SSatish Balay s4 = t[3 + idt]; 3769371c9d4SSatish Balay s5 = t[4 + idt]; 3779371c9d4SSatish Balay s6 = t[5 + idt]; 37828e30874SSatish Balay while (nz--) { 37928e30874SSatish Balay idx = 6 * (*vi++); 3809371c9d4SSatish Balay x1 = t[idx]; 3819371c9d4SSatish Balay x2 = t[1 + idx]; 3829371c9d4SSatish Balay x3 = t[2 + idx]; 3839371c9d4SSatish Balay x4 = t[3 + idx]; 3849371c9d4SSatish Balay x5 = t[4 + idx]; 3859371c9d4SSatish Balay x6 = t[5 + idx]; 38628e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 38728e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 38828e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 38928e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 39028e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 39128e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 39228e30874SSatish Balay v += 36; 39328e30874SSatish Balay } 39428e30874SSatish Balay idc = 6 * (*c--); 39528e30874SSatish Balay v = aa + 36 * diag[i]; 3969371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 3979371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 3989371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 3999371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 4009371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 4019371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 40228e30874SSatish Balay } 40328e30874SSatish Balay 4049566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 4059566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 4069566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 4079566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 4089566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 4093ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 41028e30874SSatish Balay } 41128e30874SSatish Balay 412d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) 413d71ae5a4SJacob Faibussowitsch { 41428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 41528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 41628e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 41728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 41828e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 41928e30874SSatish Balay const MatScalar *aa = a->a, *v; 42028e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 42128e30874SSatish Balay const PetscScalar *b; 42228e30874SSatish Balay 42328e30874SSatish Balay PetscFunctionBegin; 4249566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 4259566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42628e30874SSatish Balay t = a->solve_work; 42728e30874SSatish Balay 4289371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 4299371c9d4SSatish Balay r = rout; 4309371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 4319371c9d4SSatish Balay c = cout; 43228e30874SSatish Balay 43328e30874SSatish Balay /* forward solve the lower triangular */ 43428e30874SSatish Balay idx = 6 * r[0]; 4359371c9d4SSatish Balay t[0] = b[idx]; 4369371c9d4SSatish Balay t[1] = b[1 + idx]; 4379371c9d4SSatish Balay t[2] = b[2 + idx]; 4389371c9d4SSatish Balay t[3] = b[3 + idx]; 4399371c9d4SSatish Balay t[4] = b[4 + idx]; 4409371c9d4SSatish Balay t[5] = b[5 + idx]; 44128e30874SSatish Balay for (i = 1; i < n; i++) { 44228e30874SSatish Balay v = aa + 36 * ai[i]; 44328e30874SSatish Balay vi = aj + ai[i]; 44428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 44528e30874SSatish Balay idx = 6 * r[i]; 4469371c9d4SSatish Balay s1 = b[idx]; 4479371c9d4SSatish Balay s2 = b[1 + idx]; 4489371c9d4SSatish Balay s3 = b[2 + idx]; 4499371c9d4SSatish Balay s4 = b[3 + idx]; 4509371c9d4SSatish Balay s5 = b[4 + idx]; 4519371c9d4SSatish Balay s6 = b[5 + idx]; 45228e30874SSatish Balay for (m = 0; m < nz; m++) { 45328e30874SSatish Balay idx = 6 * vi[m]; 4549371c9d4SSatish Balay x1 = t[idx]; 4559371c9d4SSatish Balay x2 = t[1 + idx]; 4569371c9d4SSatish Balay x3 = t[2 + idx]; 4579371c9d4SSatish Balay x4 = t[3 + idx]; 4589371c9d4SSatish Balay x5 = t[4 + idx]; 4599371c9d4SSatish Balay x6 = t[5 + idx]; 46028e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 46128e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 46228e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 46328e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 46428e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 46528e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 46628e30874SSatish Balay v += 36; 46728e30874SSatish Balay } 46828e30874SSatish Balay idx = 6 * i; 4699371c9d4SSatish Balay t[idx] = s1; 4709371c9d4SSatish Balay t[1 + idx] = s2; 4719371c9d4SSatish Balay t[2 + idx] = s3; 4729371c9d4SSatish Balay t[3 + idx] = s4; 4739371c9d4SSatish Balay t[4 + idx] = s5; 4749371c9d4SSatish Balay t[5 + idx] = s6; 47528e30874SSatish Balay } 47628e30874SSatish Balay /* backward solve the upper triangular */ 47728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 47828e30874SSatish Balay v = aa + 36 * (adiag[i + 1] + 1); 47928e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 48028e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 48128e30874SSatish Balay idt = 6 * i; 4829371c9d4SSatish Balay s1 = t[idt]; 4839371c9d4SSatish Balay s2 = t[1 + idt]; 4849371c9d4SSatish Balay s3 = t[2 + idt]; 4859371c9d4SSatish Balay s4 = t[3 + idt]; 4869371c9d4SSatish Balay s5 = t[4 + idt]; 4879371c9d4SSatish Balay s6 = t[5 + idt]; 48828e30874SSatish Balay for (m = 0; m < nz; m++) { 48928e30874SSatish Balay idx = 6 * vi[m]; 4909371c9d4SSatish Balay x1 = t[idx]; 4919371c9d4SSatish Balay x2 = t[1 + idx]; 4929371c9d4SSatish Balay x3 = t[2 + idx]; 4939371c9d4SSatish Balay x4 = t[3 + idx]; 4949371c9d4SSatish Balay x5 = t[4 + idx]; 4959371c9d4SSatish Balay x6 = t[5 + idx]; 49628e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 49728e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 49828e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 49928e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 50028e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 50128e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 50228e30874SSatish Balay v += 36; 50328e30874SSatish Balay } 50428e30874SSatish Balay idc = 6 * c[i]; 5059371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 5069371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 5079371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 5089371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 5099371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 5109371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 51128e30874SSatish Balay } 51228e30874SSatish Balay 5139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 5149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 5159566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 5169566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 5179566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 5183ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 51928e30874SSatish Balay } 52028e30874SSatish Balay 521d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) 522d71ae5a4SJacob Faibussowitsch { 52328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 52428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 52528e30874SSatish Balay const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 52628e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 52728e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 52828e30874SSatish Balay const MatScalar *aa = a->a, *v; 52928e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 53028e30874SSatish Balay const PetscScalar *b; 53128e30874SSatish Balay 53228e30874SSatish Balay PetscFunctionBegin; 5339566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 5349566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 53528e30874SSatish Balay t = a->solve_work; 53628e30874SSatish Balay 5379371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 5389371c9d4SSatish Balay r = rout; 5399371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 5409371c9d4SSatish Balay c = cout + (n - 1); 54128e30874SSatish Balay 54228e30874SSatish Balay /* forward solve the lower triangular */ 54328e30874SSatish Balay idx = 5 * (*r++); 5449371c9d4SSatish Balay t[0] = b[idx]; 5459371c9d4SSatish Balay t[1] = b[1 + idx]; 5469371c9d4SSatish Balay t[2] = b[2 + idx]; 5479371c9d4SSatish Balay t[3] = b[3 + idx]; 5489371c9d4SSatish Balay t[4] = b[4 + idx]; 54928e30874SSatish Balay for (i = 1; i < n; i++) { 55028e30874SSatish Balay v = aa + 25 * ai[i]; 55128e30874SSatish Balay vi = aj + ai[i]; 55228e30874SSatish Balay nz = diag[i] - ai[i]; 55328e30874SSatish Balay idx = 5 * (*r++); 5549371c9d4SSatish Balay s1 = b[idx]; 5559371c9d4SSatish Balay s2 = b[1 + idx]; 5569371c9d4SSatish Balay s3 = b[2 + idx]; 5579371c9d4SSatish Balay s4 = b[3 + idx]; 55828e30874SSatish Balay s5 = b[4 + idx]; 55928e30874SSatish Balay while (nz--) { 56028e30874SSatish Balay idx = 5 * (*vi++); 5619371c9d4SSatish Balay x1 = t[idx]; 5629371c9d4SSatish Balay x2 = t[1 + idx]; 5639371c9d4SSatish Balay x3 = t[2 + idx]; 5649371c9d4SSatish Balay x4 = t[3 + idx]; 5659371c9d4SSatish Balay x5 = t[4 + idx]; 56628e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 56728e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 56828e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 56928e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 57028e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 57128e30874SSatish Balay v += 25; 57228e30874SSatish Balay } 57328e30874SSatish Balay idx = 5 * i; 5749371c9d4SSatish Balay t[idx] = s1; 5759371c9d4SSatish Balay t[1 + idx] = s2; 5769371c9d4SSatish Balay t[2 + idx] = s3; 5779371c9d4SSatish Balay t[3 + idx] = s4; 5789371c9d4SSatish Balay t[4 + idx] = s5; 57928e30874SSatish Balay } 58028e30874SSatish Balay /* backward solve the upper triangular */ 58128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 58228e30874SSatish Balay v = aa + 25 * diag[i] + 25; 58328e30874SSatish Balay vi = aj + diag[i] + 1; 58428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 58528e30874SSatish Balay idt = 5 * i; 5869371c9d4SSatish Balay s1 = t[idt]; 5879371c9d4SSatish Balay s2 = t[1 + idt]; 5889371c9d4SSatish Balay s3 = t[2 + idt]; 5899371c9d4SSatish Balay s4 = t[3 + idt]; 5909371c9d4SSatish Balay s5 = t[4 + idt]; 59128e30874SSatish Balay while (nz--) { 59228e30874SSatish Balay idx = 5 * (*vi++); 5939371c9d4SSatish Balay x1 = t[idx]; 5949371c9d4SSatish Balay x2 = t[1 + idx]; 5959371c9d4SSatish Balay x3 = t[2 + idx]; 5969371c9d4SSatish Balay x4 = t[3 + idx]; 5979371c9d4SSatish Balay x5 = t[4 + idx]; 59828e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 59928e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 60028e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 60128e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 60228e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 60328e30874SSatish Balay v += 25; 60428e30874SSatish Balay } 60528e30874SSatish Balay idc = 5 * (*c--); 60628e30874SSatish Balay v = aa + 25 * diag[i]; 6079371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 6089371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 6099371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 6109371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 6119371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 61228e30874SSatish Balay } 61328e30874SSatish Balay 6149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 6159566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 6169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 6179566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 6189566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 6193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 62028e30874SSatish Balay } 62128e30874SSatish Balay 622d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) 623d71ae5a4SJacob Faibussowitsch { 62428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 62528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 62628e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 62728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 62828e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 62928e30874SSatish Balay const MatScalar *aa = a->a, *v; 63028e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 63128e30874SSatish Balay const PetscScalar *b; 63228e30874SSatish Balay 63328e30874SSatish Balay PetscFunctionBegin; 6349566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 6359566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 63628e30874SSatish Balay t = a->solve_work; 63728e30874SSatish Balay 6389371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 6399371c9d4SSatish Balay r = rout; 6409371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 6419371c9d4SSatish Balay c = cout; 64228e30874SSatish Balay 64328e30874SSatish Balay /* forward solve the lower triangular */ 64428e30874SSatish Balay idx = 5 * r[0]; 6459371c9d4SSatish Balay t[0] = b[idx]; 6469371c9d4SSatish Balay t[1] = b[1 + idx]; 6479371c9d4SSatish Balay t[2] = b[2 + idx]; 6489371c9d4SSatish Balay t[3] = b[3 + idx]; 6499371c9d4SSatish Balay t[4] = b[4 + idx]; 65028e30874SSatish Balay for (i = 1; i < n; i++) { 65128e30874SSatish Balay v = aa + 25 * ai[i]; 65228e30874SSatish Balay vi = aj + ai[i]; 65328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 65428e30874SSatish Balay idx = 5 * r[i]; 6559371c9d4SSatish Balay s1 = b[idx]; 6569371c9d4SSatish Balay s2 = b[1 + idx]; 6579371c9d4SSatish Balay s3 = b[2 + idx]; 6589371c9d4SSatish Balay s4 = b[3 + idx]; 65928e30874SSatish Balay s5 = b[4 + idx]; 66028e30874SSatish Balay for (m = 0; m < nz; m++) { 66128e30874SSatish Balay idx = 5 * vi[m]; 6629371c9d4SSatish Balay x1 = t[idx]; 6639371c9d4SSatish Balay x2 = t[1 + idx]; 6649371c9d4SSatish Balay x3 = t[2 + idx]; 6659371c9d4SSatish Balay x4 = t[3 + idx]; 6669371c9d4SSatish Balay x5 = t[4 + idx]; 66728e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 66828e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 66928e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 67028e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 67128e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 67228e30874SSatish Balay v += 25; 67328e30874SSatish Balay } 67428e30874SSatish Balay idx = 5 * i; 6759371c9d4SSatish Balay t[idx] = s1; 6769371c9d4SSatish Balay t[1 + idx] = s2; 6779371c9d4SSatish Balay t[2 + idx] = s3; 6789371c9d4SSatish Balay t[3 + idx] = s4; 6799371c9d4SSatish Balay t[4 + idx] = s5; 68028e30874SSatish Balay } 68128e30874SSatish Balay /* backward solve the upper triangular */ 68228e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 68328e30874SSatish Balay v = aa + 25 * (adiag[i + 1] + 1); 68428e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 68528e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 68628e30874SSatish Balay idt = 5 * i; 6879371c9d4SSatish Balay s1 = t[idt]; 6889371c9d4SSatish Balay s2 = t[1 + idt]; 6899371c9d4SSatish Balay s3 = t[2 + idt]; 6909371c9d4SSatish Balay s4 = t[3 + idt]; 6919371c9d4SSatish Balay s5 = t[4 + idt]; 69228e30874SSatish Balay for (m = 0; m < nz; m++) { 69328e30874SSatish Balay idx = 5 * vi[m]; 6949371c9d4SSatish Balay x1 = t[idx]; 6959371c9d4SSatish Balay x2 = t[1 + idx]; 6969371c9d4SSatish Balay x3 = t[2 + idx]; 6979371c9d4SSatish Balay x4 = t[3 + idx]; 6989371c9d4SSatish Balay x5 = t[4 + idx]; 69928e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 70028e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 70128e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 70228e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 70328e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 70428e30874SSatish Balay v += 25; 70528e30874SSatish Balay } 70628e30874SSatish Balay idc = 5 * c[i]; 7079371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 7089371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 7099371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 7109371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 7119371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 71228e30874SSatish Balay } 71328e30874SSatish Balay 7149566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7159566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7169566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 7179566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 7189566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 7193ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 72028e30874SSatish Balay } 72128e30874SSatish Balay 722d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) 723d71ae5a4SJacob Faibussowitsch { 72428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 72528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 72628e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 72728e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 72828e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 72928e30874SSatish Balay const MatScalar *aa = a->a, *v; 73028e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 73128e30874SSatish Balay const PetscScalar *b; 73228e30874SSatish Balay 73328e30874SSatish Balay PetscFunctionBegin; 7349566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7359566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 73628e30874SSatish Balay t = a->solve_work; 73728e30874SSatish Balay 7389371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 7399371c9d4SSatish Balay r = rout; 7409371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 7419371c9d4SSatish Balay c = cout + (n - 1); 74228e30874SSatish Balay 74328e30874SSatish Balay /* forward solve the lower triangular */ 74428e30874SSatish Balay idx = 4 * (*r++); 7459371c9d4SSatish Balay t[0] = b[idx]; 7469371c9d4SSatish Balay t[1] = b[1 + idx]; 7479371c9d4SSatish Balay t[2] = b[2 + idx]; 7489371c9d4SSatish Balay t[3] = b[3 + idx]; 74928e30874SSatish Balay for (i = 1; i < n; i++) { 75028e30874SSatish Balay v = aa + 16 * ai[i]; 75128e30874SSatish Balay vi = aj + ai[i]; 75228e30874SSatish Balay nz = diag[i] - ai[i]; 75328e30874SSatish Balay idx = 4 * (*r++); 7549371c9d4SSatish Balay s1 = b[idx]; 7559371c9d4SSatish Balay s2 = b[1 + idx]; 7569371c9d4SSatish Balay s3 = b[2 + idx]; 7579371c9d4SSatish Balay s4 = b[3 + idx]; 75828e30874SSatish Balay while (nz--) { 75928e30874SSatish Balay idx = 4 * (*vi++); 7609371c9d4SSatish Balay x1 = t[idx]; 7619371c9d4SSatish Balay x2 = t[1 + idx]; 7629371c9d4SSatish Balay x3 = t[2 + idx]; 7639371c9d4SSatish Balay x4 = t[3 + idx]; 76428e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 76528e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 76628e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 76728e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 76828e30874SSatish Balay v += 16; 76928e30874SSatish Balay } 77028e30874SSatish Balay idx = 4 * i; 7719371c9d4SSatish Balay t[idx] = s1; 7729371c9d4SSatish Balay t[1 + idx] = s2; 7739371c9d4SSatish Balay t[2 + idx] = s3; 7749371c9d4SSatish Balay t[3 + idx] = s4; 77528e30874SSatish Balay } 77628e30874SSatish Balay /* backward solve the upper triangular */ 77728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 77828e30874SSatish Balay v = aa + 16 * diag[i] + 16; 77928e30874SSatish Balay vi = aj + diag[i] + 1; 78028e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 78128e30874SSatish Balay idt = 4 * i; 7829371c9d4SSatish Balay s1 = t[idt]; 7839371c9d4SSatish Balay s2 = t[1 + idt]; 7849371c9d4SSatish Balay s3 = t[2 + idt]; 7859371c9d4SSatish Balay s4 = t[3 + idt]; 78628e30874SSatish Balay while (nz--) { 78728e30874SSatish Balay idx = 4 * (*vi++); 7889371c9d4SSatish Balay x1 = t[idx]; 7899371c9d4SSatish Balay x2 = t[1 + idx]; 7909371c9d4SSatish Balay x3 = t[2 + idx]; 7919371c9d4SSatish Balay x4 = t[3 + idx]; 79228e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 79328e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 79428e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 79528e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 79628e30874SSatish Balay v += 16; 79728e30874SSatish Balay } 79828e30874SSatish Balay idc = 4 * (*c--); 79928e30874SSatish Balay v = aa + 16 * diag[i]; 80028e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 80128e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 80228e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 80328e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 80428e30874SSatish Balay } 80528e30874SSatish Balay 8069566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 8113ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 81228e30874SSatish Balay } 81328e30874SSatish Balay 814d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) 815d71ae5a4SJacob Faibussowitsch { 81628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 81728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 81828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 81928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 82028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 82128e30874SSatish Balay const MatScalar *aa = a->a, *v; 82228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 82328e30874SSatish Balay const PetscScalar *b; 82428e30874SSatish Balay 82528e30874SSatish Balay PetscFunctionBegin; 8269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 8279566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 82828e30874SSatish Balay t = a->solve_work; 82928e30874SSatish Balay 8309371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 8319371c9d4SSatish Balay r = rout; 8329371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 8339371c9d4SSatish Balay c = cout; 83428e30874SSatish Balay 83528e30874SSatish Balay /* forward solve the lower triangular */ 83628e30874SSatish Balay idx = 4 * r[0]; 8379371c9d4SSatish Balay t[0] = b[idx]; 8389371c9d4SSatish Balay t[1] = b[1 + idx]; 8399371c9d4SSatish Balay t[2] = b[2 + idx]; 8409371c9d4SSatish Balay t[3] = b[3 + idx]; 84128e30874SSatish Balay for (i = 1; i < n; i++) { 84228e30874SSatish Balay v = aa + 16 * ai[i]; 84328e30874SSatish Balay vi = aj + ai[i]; 84428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 84528e30874SSatish Balay idx = 4 * r[i]; 8469371c9d4SSatish Balay s1 = b[idx]; 8479371c9d4SSatish Balay s2 = b[1 + idx]; 8489371c9d4SSatish Balay s3 = b[2 + idx]; 8499371c9d4SSatish Balay s4 = b[3 + idx]; 85028e30874SSatish Balay for (m = 0; m < nz; m++) { 85128e30874SSatish Balay idx = 4 * vi[m]; 8529371c9d4SSatish Balay x1 = t[idx]; 8539371c9d4SSatish Balay x2 = t[1 + idx]; 8549371c9d4SSatish Balay x3 = t[2 + idx]; 8559371c9d4SSatish Balay x4 = t[3 + idx]; 85628e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 85728e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 85828e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 85928e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 86028e30874SSatish Balay v += 16; 86128e30874SSatish Balay } 86228e30874SSatish Balay idx = 4 * i; 8639371c9d4SSatish Balay t[idx] = s1; 8649371c9d4SSatish Balay t[1 + idx] = s2; 8659371c9d4SSatish Balay t[2 + idx] = s3; 8669371c9d4SSatish Balay t[3 + idx] = s4; 86728e30874SSatish Balay } 86828e30874SSatish Balay /* backward solve the upper triangular */ 86928e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 87028e30874SSatish Balay v = aa + 16 * (adiag[i + 1] + 1); 87128e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 87228e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 87328e30874SSatish Balay idt = 4 * i; 8749371c9d4SSatish Balay s1 = t[idt]; 8759371c9d4SSatish Balay s2 = t[1 + idt]; 8769371c9d4SSatish Balay s3 = t[2 + idt]; 8779371c9d4SSatish Balay s4 = t[3 + idt]; 87828e30874SSatish Balay for (m = 0; m < nz; m++) { 87928e30874SSatish Balay idx = 4 * vi[m]; 8809371c9d4SSatish Balay x1 = t[idx]; 8819371c9d4SSatish Balay x2 = t[1 + idx]; 8829371c9d4SSatish Balay x3 = t[2 + idx]; 8839371c9d4SSatish Balay x4 = t[3 + idx]; 88428e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 88528e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 88628e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 88728e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 88828e30874SSatish Balay v += 16; 88928e30874SSatish Balay } 89028e30874SSatish Balay idc = 4 * c[i]; 89128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 89228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 89328e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 89428e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 89528e30874SSatish Balay } 89628e30874SSatish Balay 8979566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8999566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 9009566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 9023ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 90328e30874SSatish Balay } 90428e30874SSatish Balay 905d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) 906d71ae5a4SJacob Faibussowitsch { 90728e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 90828e30874SSatish Balay IS iscol = a->col, isrow = a->row; 90928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 91028e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 91128e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 91228e30874SSatish Balay const MatScalar *aa = a->a, *v; 91328e30874SSatish Balay MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 91428e30874SSatish Balay PetscScalar *x; 91528e30874SSatish Balay const PetscScalar *b; 91628e30874SSatish Balay 91728e30874SSatish Balay PetscFunctionBegin; 9189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 9199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 92028e30874SSatish Balay t = (MatScalar *)a->solve_work; 92128e30874SSatish Balay 9229371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 9239371c9d4SSatish Balay r = rout; 9249371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 9259371c9d4SSatish Balay c = cout + (n - 1); 92628e30874SSatish Balay 92728e30874SSatish Balay /* forward solve the lower triangular */ 92828e30874SSatish Balay idx = 4 * (*r++); 92928e30874SSatish Balay t[0] = (MatScalar)b[idx]; 93028e30874SSatish Balay t[1] = (MatScalar)b[1 + idx]; 93128e30874SSatish Balay t[2] = (MatScalar)b[2 + idx]; 93228e30874SSatish Balay t[3] = (MatScalar)b[3 + idx]; 93328e30874SSatish Balay for (i = 1; i < n; i++) { 93428e30874SSatish Balay v = aa + 16 * ai[i]; 93528e30874SSatish Balay vi = aj + ai[i]; 93628e30874SSatish Balay nz = diag[i] - ai[i]; 93728e30874SSatish Balay idx = 4 * (*r++); 93828e30874SSatish Balay s1 = (MatScalar)b[idx]; 93928e30874SSatish Balay s2 = (MatScalar)b[1 + idx]; 94028e30874SSatish Balay s3 = (MatScalar)b[2 + idx]; 94128e30874SSatish Balay s4 = (MatScalar)b[3 + idx]; 94228e30874SSatish Balay while (nz--) { 94328e30874SSatish Balay idx = 4 * (*vi++); 94428e30874SSatish Balay x1 = t[idx]; 94528e30874SSatish Balay x2 = t[1 + idx]; 94628e30874SSatish Balay x3 = t[2 + idx]; 94728e30874SSatish Balay x4 = t[3 + idx]; 94828e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 94928e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 95028e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 95128e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 95228e30874SSatish Balay v += 16; 95328e30874SSatish Balay } 95428e30874SSatish Balay idx = 4 * i; 95528e30874SSatish Balay t[idx] = s1; 95628e30874SSatish Balay t[1 + idx] = s2; 95728e30874SSatish Balay t[2 + idx] = s3; 95828e30874SSatish Balay t[3 + idx] = s4; 95928e30874SSatish Balay } 96028e30874SSatish Balay /* backward solve the upper triangular */ 96128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 96228e30874SSatish Balay v = aa + 16 * diag[i] + 16; 96328e30874SSatish Balay vi = aj + diag[i] + 1; 96428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 96528e30874SSatish Balay idt = 4 * i; 96628e30874SSatish Balay s1 = t[idt]; 96728e30874SSatish Balay s2 = t[1 + idt]; 96828e30874SSatish Balay s3 = t[2 + idt]; 96928e30874SSatish Balay s4 = t[3 + idt]; 97028e30874SSatish Balay while (nz--) { 97128e30874SSatish Balay idx = 4 * (*vi++); 97228e30874SSatish Balay x1 = t[idx]; 97328e30874SSatish Balay x2 = t[1 + idx]; 97428e30874SSatish Balay x3 = t[2 + idx]; 97528e30874SSatish Balay x4 = t[3 + idx]; 97628e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 97728e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 97828e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 97928e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 98028e30874SSatish Balay v += 16; 98128e30874SSatish Balay } 98228e30874SSatish Balay idc = 4 * (*c--); 98328e30874SSatish Balay v = aa + 16 * diag[i]; 98428e30874SSatish Balay t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 98528e30874SSatish Balay t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 98628e30874SSatish Balay t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 98728e30874SSatish Balay t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 98828e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 98928e30874SSatish Balay x[1 + idc] = (PetscScalar)t[1 + idt]; 99028e30874SSatish Balay x[2 + idc] = (PetscScalar)t[2 + idt]; 99128e30874SSatish Balay x[3 + idc] = (PetscScalar)t[3 + idt]; 99228e30874SSatish Balay } 99328e30874SSatish Balay 9949566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 9959566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 9969566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 9979566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9989566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 9993ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 100028e30874SSatish Balay } 100128e30874SSatish Balay 100228e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 100328e30874SSatish Balay 100428e30874SSatish Balay #include PETSC_HAVE_SSE 100528e30874SSatish Balay 1006d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) 1007d71ae5a4SJacob Faibussowitsch { 100828e30874SSatish Balay /* 100928e30874SSatish Balay Note: This code uses demotion of double 101028e30874SSatish Balay to float when performing the mixed-mode computation. 101128e30874SSatish Balay This may not be numerically reasonable for all applications. 101228e30874SSatish Balay */ 101328e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 101428e30874SSatish Balay IS iscol = a->col, isrow = a->row; 101528e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 101628e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 101728e30874SSatish Balay MatScalar *aa = a->a, *v; 101828e30874SSatish Balay PetscScalar *x, *b, *t; 101928e30874SSatish Balay 102028e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 102128e30874SSatish Balay float ssealignedspace[11], *tmps, *tmpx; 102228e30874SSatish Balay unsigned long offset; 102328e30874SSatish Balay 102428e30874SSatish Balay PetscFunctionBegin; 102528e30874SSatish Balay SSE_SCOPE_BEGIN; 102628e30874SSatish Balay 102728e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 102828e30874SSatish Balay if (offset) offset = (16 - offset) / 4; 102928e30874SSatish Balay tmps = &ssealignedspace[offset]; 103028e30874SSatish Balay tmpx = &ssealignedspace[offset + 4]; 103128e30874SSatish Balay PREFETCH_NTA(aa + 16 * ai[1]); 103228e30874SSatish Balay 10339566063dSJacob Faibussowitsch PetscCall(VecGetArray(bb, &b)); 10349566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 103528e30874SSatish Balay t = a->solve_work; 103628e30874SSatish Balay 10379371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 10389371c9d4SSatish Balay r = rout; 10399371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 10409371c9d4SSatish Balay c = cout + (n - 1); 104128e30874SSatish Balay 104228e30874SSatish Balay /* forward solve the lower triangular */ 104328e30874SSatish Balay idx = 4 * (*r++); 10449371c9d4SSatish Balay t[0] = b[idx]; 10459371c9d4SSatish Balay t[1] = b[1 + idx]; 10469371c9d4SSatish Balay t[2] = b[2 + idx]; 10479371c9d4SSatish Balay t[3] = b[3 + idx]; 104828e30874SSatish Balay v = aa + 16 * ai[1]; 104928e30874SSatish Balay 105028e30874SSatish Balay for (i = 1; i < n;) { 105128e30874SSatish Balay PREFETCH_NTA(&v[8]); 105228e30874SSatish Balay vi = aj + ai[i]; 105328e30874SSatish Balay nz = diag[i] - ai[i]; 105428e30874SSatish Balay idx = 4 * (*r++); 105528e30874SSatish Balay 105628e30874SSatish Balay /* Demote sum from double to float */ 105728e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 105828e30874SSatish Balay LOAD_PS(tmps, XMM7); 105928e30874SSatish Balay 106028e30874SSatish Balay while (nz--) { 106128e30874SSatish Balay PREFETCH_NTA(&v[16]); 106228e30874SSatish Balay idx = 4 * (*vi++); 106328e30874SSatish Balay 106428e30874SSatish Balay /* Demote solution (so far) from double to float */ 106528e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 106628e30874SSatish Balay 106728e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 106828e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 106928e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 107028e30874SSatish Balay 107128e30874SSatish Balay /* First Column */ 107228e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 107328e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 107428e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 107528e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 107628e30874SSatish Balay 107728e30874SSatish Balay /* Second Column */ 107828e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 107928e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 108028e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 108128e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 108228e30874SSatish Balay 108328e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 108428e30874SSatish Balay 108528e30874SSatish Balay /* Third Column */ 108628e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 108728e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 108828e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 108928e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 109028e30874SSatish Balay 109128e30874SSatish Balay /* Fourth Column */ 109228e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 109328e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 109428e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 109528e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 109628e30874SSatish Balay SSE_INLINE_END_2 109728e30874SSatish Balay 109828e30874SSatish Balay v += 16; 109928e30874SSatish Balay } 110028e30874SSatish Balay idx = 4 * i; 110128e30874SSatish Balay v = aa + 16 * ai[++i]; 110228e30874SSatish Balay PREFETCH_NTA(v); 110328e30874SSatish Balay STORE_PS(tmps, XMM7); 110428e30874SSatish Balay 110528e30874SSatish Balay /* Promote result from float to double */ 110628e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 110728e30874SSatish Balay } 110828e30874SSatish Balay /* backward solve the upper triangular */ 110928e30874SSatish Balay idt = 4 * (n - 1); 111028e30874SSatish Balay ai16 = 16 * diag[n - 1]; 111128e30874SSatish Balay v = aa + ai16 + 16; 111228e30874SSatish Balay for (i = n - 1; i >= 0;) { 111328e30874SSatish Balay PREFETCH_NTA(&v[8]); 111428e30874SSatish Balay vi = aj + diag[i] + 1; 111528e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 111628e30874SSatish Balay 111728e30874SSatish Balay /* Demote accumulator from double to float */ 111828e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 111928e30874SSatish Balay LOAD_PS(tmps, XMM7); 112028e30874SSatish Balay 112128e30874SSatish Balay while (nz--) { 112228e30874SSatish Balay PREFETCH_NTA(&v[16]); 112328e30874SSatish Balay idx = 4 * (*vi++); 112428e30874SSatish Balay 112528e30874SSatish Balay /* Demote solution (so far) from double to float */ 112628e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 112728e30874SSatish Balay 112828e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 112928e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 113028e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 113128e30874SSatish Balay 113228e30874SSatish Balay /* First Column */ 113328e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 113428e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 113528e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 113628e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 113728e30874SSatish Balay 113828e30874SSatish Balay /* Second Column */ 113928e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 114028e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 114128e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 114228e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 114328e30874SSatish Balay 114428e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 114528e30874SSatish Balay 114628e30874SSatish Balay /* Third Column */ 114728e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 114828e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 114928e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 115028e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 115128e30874SSatish Balay 115228e30874SSatish Balay /* Fourth Column */ 115328e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 115428e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 115528e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 115628e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 115728e30874SSatish Balay SSE_INLINE_END_2 115828e30874SSatish Balay v += 16; 115928e30874SSatish Balay } 116028e30874SSatish Balay v = aa + ai16; 116128e30874SSatish Balay ai16 = 16 * diag[--i]; 116228e30874SSatish Balay PREFETCH_NTA(aa + ai16 + 16); 116328e30874SSatish Balay /* 116428e30874SSatish Balay Scale the result by the diagonal 4x4 block, 116528e30874SSatish Balay which was inverted as part of the factorization 116628e30874SSatish Balay */ 116728e30874SSatish Balay SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 116828e30874SSatish Balay /* First Column */ 116928e30874SSatish Balay SSE_COPY_PS(XMM0, XMM7) 117028e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 117128e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 117228e30874SSatish Balay 117328e30874SSatish Balay /* Second Column */ 117428e30874SSatish Balay SSE_COPY_PS(XMM1, XMM7) 117528e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 117628e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 117728e30874SSatish Balay SSE_ADD_PS(XMM0, XMM1) 117828e30874SSatish Balay 117928e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 118028e30874SSatish Balay 118128e30874SSatish Balay /* Third Column */ 118228e30874SSatish Balay SSE_COPY_PS(XMM2, XMM7) 118328e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 118428e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 118528e30874SSatish Balay SSE_ADD_PS(XMM0, XMM2) 118628e30874SSatish Balay 118728e30874SSatish Balay /* Fourth Column */ 118828e30874SSatish Balay SSE_COPY_PS(XMM3, XMM7) 118928e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 119028e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 119128e30874SSatish Balay SSE_ADD_PS(XMM0, XMM3) 119228e30874SSatish Balay 119328e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 119428e30874SSatish Balay SSE_INLINE_END_3 119528e30874SSatish Balay 119628e30874SSatish Balay /* Promote solution from float to double */ 119728e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 119828e30874SSatish Balay 119928e30874SSatish Balay /* Apply reordering to t and stream into x. */ 120028e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 120128e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 120228e30874SSatish Balay idc = 4 * (*c--); 120328e30874SSatish Balay SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 120428e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 120528e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 120628e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 120728e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 120828e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 120928e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 121028e30874SSatish Balay SSE_INLINE_END_2 121128e30874SSatish Balay v = aa + ai16 + 16; 121228e30874SSatish Balay idt -= 4; 121328e30874SSatish Balay } 121428e30874SSatish Balay 12159566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12169566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12179566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(bb, &b)); 12189566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12199566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 122028e30874SSatish Balay SSE_SCOPE_END; 12213ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 122228e30874SSatish Balay } 122328e30874SSatish Balay 122428e30874SSatish Balay #endif 122528e30874SSatish Balay 1226d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) 1227d71ae5a4SJacob Faibussowitsch { 122828e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 122928e30874SSatish Balay IS iscol = a->col, isrow = a->row; 123028e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 123128e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 123228e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 123328e30874SSatish Balay const MatScalar *aa = a->a, *v; 123428e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 123528e30874SSatish Balay const PetscScalar *b; 123628e30874SSatish Balay 123728e30874SSatish Balay PetscFunctionBegin; 12389566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 12399566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 124028e30874SSatish Balay t = a->solve_work; 124128e30874SSatish Balay 12429371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 12439371c9d4SSatish Balay r = rout; 12449371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 12459371c9d4SSatish Balay c = cout + (n - 1); 124628e30874SSatish Balay 124728e30874SSatish Balay /* forward solve the lower triangular */ 124828e30874SSatish Balay idx = 3 * (*r++); 12499371c9d4SSatish Balay t[0] = b[idx]; 12509371c9d4SSatish Balay t[1] = b[1 + idx]; 12519371c9d4SSatish Balay t[2] = b[2 + idx]; 125228e30874SSatish Balay for (i = 1; i < n; i++) { 125328e30874SSatish Balay v = aa + 9 * ai[i]; 125428e30874SSatish Balay vi = aj + ai[i]; 125528e30874SSatish Balay nz = diag[i] - ai[i]; 125628e30874SSatish Balay idx = 3 * (*r++); 12579371c9d4SSatish Balay s1 = b[idx]; 12589371c9d4SSatish Balay s2 = b[1 + idx]; 12599371c9d4SSatish Balay s3 = b[2 + idx]; 126028e30874SSatish Balay while (nz--) { 126128e30874SSatish Balay idx = 3 * (*vi++); 12629371c9d4SSatish Balay x1 = t[idx]; 12639371c9d4SSatish Balay x2 = t[1 + idx]; 12649371c9d4SSatish Balay x3 = t[2 + idx]; 126528e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 126628e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 126728e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 126828e30874SSatish Balay v += 9; 126928e30874SSatish Balay } 127028e30874SSatish Balay idx = 3 * i; 12719371c9d4SSatish Balay t[idx] = s1; 12729371c9d4SSatish Balay t[1 + idx] = s2; 12739371c9d4SSatish Balay t[2 + idx] = s3; 127428e30874SSatish Balay } 127528e30874SSatish Balay /* backward solve the upper triangular */ 127628e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 127728e30874SSatish Balay v = aa + 9 * diag[i] + 9; 127828e30874SSatish Balay vi = aj + diag[i] + 1; 127928e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 128028e30874SSatish Balay idt = 3 * i; 12819371c9d4SSatish Balay s1 = t[idt]; 12829371c9d4SSatish Balay s2 = t[1 + idt]; 12839371c9d4SSatish Balay s3 = t[2 + idt]; 128428e30874SSatish Balay while (nz--) { 128528e30874SSatish Balay idx = 3 * (*vi++); 12869371c9d4SSatish Balay x1 = t[idx]; 12879371c9d4SSatish Balay x2 = t[1 + idx]; 12889371c9d4SSatish Balay x3 = t[2 + idx]; 128928e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 129028e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 129128e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 129228e30874SSatish Balay v += 9; 129328e30874SSatish Balay } 129428e30874SSatish Balay idc = 3 * (*c--); 129528e30874SSatish Balay v = aa + 9 * diag[i]; 129628e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 129728e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 129828e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 129928e30874SSatish Balay } 13009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13019566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13029566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13039566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13049566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 13053ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 130628e30874SSatish Balay } 130728e30874SSatish Balay 1308d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) 1309d71ae5a4SJacob Faibussowitsch { 131028e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 131128e30874SSatish Balay IS iscol = a->col, isrow = a->row; 131228e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 131328e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 131428e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 131528e30874SSatish Balay const MatScalar *aa = a->a, *v; 131628e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 131728e30874SSatish Balay const PetscScalar *b; 131828e30874SSatish Balay 131928e30874SSatish Balay PetscFunctionBegin; 13209566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13219566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 132228e30874SSatish Balay t = a->solve_work; 132328e30874SSatish Balay 13249371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 13259371c9d4SSatish Balay r = rout; 13269371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 13279371c9d4SSatish Balay c = cout; 132828e30874SSatish Balay 132928e30874SSatish Balay /* forward solve the lower triangular */ 133028e30874SSatish Balay idx = 3 * r[0]; 13319371c9d4SSatish Balay t[0] = b[idx]; 13329371c9d4SSatish Balay t[1] = b[1 + idx]; 13339371c9d4SSatish Balay t[2] = b[2 + idx]; 133428e30874SSatish Balay for (i = 1; i < n; i++) { 133528e30874SSatish Balay v = aa + 9 * ai[i]; 133628e30874SSatish Balay vi = aj + ai[i]; 133728e30874SSatish Balay nz = ai[i + 1] - ai[i]; 133828e30874SSatish Balay idx = 3 * r[i]; 13399371c9d4SSatish Balay s1 = b[idx]; 13409371c9d4SSatish Balay s2 = b[1 + idx]; 13419371c9d4SSatish Balay s3 = b[2 + idx]; 134228e30874SSatish Balay for (m = 0; m < nz; m++) { 134328e30874SSatish Balay idx = 3 * vi[m]; 13449371c9d4SSatish Balay x1 = t[idx]; 13459371c9d4SSatish Balay x2 = t[1 + idx]; 13469371c9d4SSatish Balay x3 = t[2 + idx]; 134728e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 134828e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 134928e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 135028e30874SSatish Balay v += 9; 135128e30874SSatish Balay } 135228e30874SSatish Balay idx = 3 * i; 13539371c9d4SSatish Balay t[idx] = s1; 13549371c9d4SSatish Balay t[1 + idx] = s2; 13559371c9d4SSatish Balay t[2 + idx] = s3; 135628e30874SSatish Balay } 135728e30874SSatish Balay /* backward solve the upper triangular */ 135828e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 135928e30874SSatish Balay v = aa + 9 * (adiag[i + 1] + 1); 136028e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 136128e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 136228e30874SSatish Balay idt = 3 * i; 13639371c9d4SSatish Balay s1 = t[idt]; 13649371c9d4SSatish Balay s2 = t[1 + idt]; 13659371c9d4SSatish Balay s3 = t[2 + idt]; 136628e30874SSatish Balay for (m = 0; m < nz; m++) { 136728e30874SSatish Balay idx = 3 * vi[m]; 13689371c9d4SSatish Balay x1 = t[idx]; 13699371c9d4SSatish Balay x2 = t[1 + idx]; 13709371c9d4SSatish Balay x3 = t[2 + idx]; 137128e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 137228e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 137328e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 137428e30874SSatish Balay v += 9; 137528e30874SSatish Balay } 137628e30874SSatish Balay idc = 3 * c[i]; 137728e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 137828e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 137928e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 138028e30874SSatish Balay } 13819566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13829566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13839566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13849566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13859566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 13863ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 138728e30874SSatish Balay } 138828e30874SSatish Balay 1389d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) 1390d71ae5a4SJacob Faibussowitsch { 139128e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 139228e30874SSatish Balay IS iscol = a->col, isrow = a->row; 139328e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 139428e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 139528e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 139628e30874SSatish Balay const MatScalar *aa = a->a, *v; 139728e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 139828e30874SSatish Balay const PetscScalar *b; 139928e30874SSatish Balay 140028e30874SSatish Balay PetscFunctionBegin; 14019566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14029566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 140328e30874SSatish Balay t = a->solve_work; 140428e30874SSatish Balay 14059371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14069371c9d4SSatish Balay r = rout; 14079371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14089371c9d4SSatish Balay c = cout + (n - 1); 140928e30874SSatish Balay 141028e30874SSatish Balay /* forward solve the lower triangular */ 141128e30874SSatish Balay idx = 2 * (*r++); 14129371c9d4SSatish Balay t[0] = b[idx]; 14139371c9d4SSatish Balay t[1] = b[1 + idx]; 141428e30874SSatish Balay for (i = 1; i < n; i++) { 141528e30874SSatish Balay v = aa + 4 * ai[i]; 141628e30874SSatish Balay vi = aj + ai[i]; 141728e30874SSatish Balay nz = diag[i] - ai[i]; 141828e30874SSatish Balay idx = 2 * (*r++); 14199371c9d4SSatish Balay s1 = b[idx]; 14209371c9d4SSatish Balay s2 = b[1 + idx]; 142128e30874SSatish Balay while (nz--) { 142228e30874SSatish Balay idx = 2 * (*vi++); 14239371c9d4SSatish Balay x1 = t[idx]; 14249371c9d4SSatish Balay x2 = t[1 + idx]; 142528e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 142628e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 142728e30874SSatish Balay v += 4; 142828e30874SSatish Balay } 142928e30874SSatish Balay idx = 2 * i; 14309371c9d4SSatish Balay t[idx] = s1; 14319371c9d4SSatish Balay t[1 + idx] = s2; 143228e30874SSatish Balay } 143328e30874SSatish Balay /* backward solve the upper triangular */ 143428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 143528e30874SSatish Balay v = aa + 4 * diag[i] + 4; 143628e30874SSatish Balay vi = aj + diag[i] + 1; 143728e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 143828e30874SSatish Balay idt = 2 * i; 14399371c9d4SSatish Balay s1 = t[idt]; 14409371c9d4SSatish Balay s2 = t[1 + idt]; 144128e30874SSatish Balay while (nz--) { 144228e30874SSatish Balay idx = 2 * (*vi++); 14439371c9d4SSatish Balay x1 = t[idx]; 14449371c9d4SSatish Balay x2 = t[1 + idx]; 144528e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 144628e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 144728e30874SSatish Balay v += 4; 144828e30874SSatish Balay } 144928e30874SSatish Balay idc = 2 * (*c--); 145028e30874SSatish Balay v = aa + 4 * diag[i]; 145128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 145228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 145328e30874SSatish Balay } 14549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 14559566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 14569566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 14579566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 14589566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 14593ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 146028e30874SSatish Balay } 146128e30874SSatish Balay 1462d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) 1463d71ae5a4SJacob Faibussowitsch { 146428e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 146528e30874SSatish Balay IS iscol = a->col, isrow = a->row; 146628e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 146728e30874SSatish Balay PetscInt i, nz, idx, jdx, idt, idc, m; 146828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 146928e30874SSatish Balay const MatScalar *aa = a->a, *v; 147028e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 147128e30874SSatish Balay const PetscScalar *b; 147228e30874SSatish Balay 147328e30874SSatish Balay PetscFunctionBegin; 14749566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14759566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 147628e30874SSatish Balay t = a->solve_work; 147728e30874SSatish Balay 14789371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 14799371c9d4SSatish Balay r = rout; 14809371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 14819371c9d4SSatish Balay c = cout; 148228e30874SSatish Balay 148328e30874SSatish Balay /* forward solve the lower triangular */ 148428e30874SSatish Balay idx = 2 * r[0]; 14859371c9d4SSatish Balay t[0] = b[idx]; 14869371c9d4SSatish Balay t[1] = b[1 + idx]; 148728e30874SSatish Balay for (i = 1; i < n; i++) { 148828e30874SSatish Balay v = aa + 4 * ai[i]; 148928e30874SSatish Balay vi = aj + ai[i]; 149028e30874SSatish Balay nz = ai[i + 1] - ai[i]; 149128e30874SSatish Balay idx = 2 * r[i]; 14929371c9d4SSatish Balay s1 = b[idx]; 14939371c9d4SSatish Balay s2 = b[1 + idx]; 149428e30874SSatish Balay for (m = 0; m < nz; m++) { 149528e30874SSatish Balay jdx = 2 * vi[m]; 14969371c9d4SSatish Balay x1 = t[jdx]; 14979371c9d4SSatish Balay x2 = t[1 + jdx]; 149828e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 149928e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 150028e30874SSatish Balay v += 4; 150128e30874SSatish Balay } 150228e30874SSatish Balay idx = 2 * i; 15039371c9d4SSatish Balay t[idx] = s1; 15049371c9d4SSatish Balay t[1 + idx] = s2; 150528e30874SSatish Balay } 150628e30874SSatish Balay /* backward solve the upper triangular */ 150728e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 150828e30874SSatish Balay v = aa + 4 * (adiag[i + 1] + 1); 150928e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 151028e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 151128e30874SSatish Balay idt = 2 * i; 15129371c9d4SSatish Balay s1 = t[idt]; 15139371c9d4SSatish Balay s2 = t[1 + idt]; 151428e30874SSatish Balay for (m = 0; m < nz; m++) { 151528e30874SSatish Balay idx = 2 * vi[m]; 15169371c9d4SSatish Balay x1 = t[idx]; 15179371c9d4SSatish Balay x2 = t[1 + idx]; 151828e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 151928e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 152028e30874SSatish Balay v += 4; 152128e30874SSatish Balay } 152228e30874SSatish Balay idc = 2 * c[i]; 152328e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 152428e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 152528e30874SSatish Balay } 15269566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15279566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15289566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15299566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15309566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 15313ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 153228e30874SSatish Balay } 153328e30874SSatish Balay 1534d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) 1535d71ae5a4SJacob Faibussowitsch { 153628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 153728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 153828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 153928e30874SSatish Balay PetscInt i, nz; 154028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 154128e30874SSatish Balay const MatScalar *aa = a->a, *v; 154228e30874SSatish Balay PetscScalar *x, s1, *t; 154328e30874SSatish Balay const PetscScalar *b; 154428e30874SSatish Balay 154528e30874SSatish Balay PetscFunctionBegin; 15463ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 154728e30874SSatish Balay 15489566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15499566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 155028e30874SSatish Balay t = a->solve_work; 155128e30874SSatish Balay 15529371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 15539371c9d4SSatish Balay r = rout; 15549371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 15559371c9d4SSatish Balay c = cout + (n - 1); 155628e30874SSatish Balay 155728e30874SSatish Balay /* forward solve the lower triangular */ 155828e30874SSatish Balay t[0] = b[*r++]; 155928e30874SSatish Balay for (i = 1; i < n; i++) { 156028e30874SSatish Balay v = aa + ai[i]; 156128e30874SSatish Balay vi = aj + ai[i]; 156228e30874SSatish Balay nz = diag[i] - ai[i]; 156328e30874SSatish Balay s1 = b[*r++]; 1564ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 156528e30874SSatish Balay t[i] = s1; 156628e30874SSatish Balay } 156728e30874SSatish Balay /* backward solve the upper triangular */ 156828e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 156928e30874SSatish Balay v = aa + diag[i] + 1; 157028e30874SSatish Balay vi = aj + diag[i] + 1; 157128e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 157228e30874SSatish Balay s1 = t[i]; 1573ad540459SPierre Jolivet while (nz--) s1 -= (*v++) * t[*vi++]; 157428e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]] * s1; 157528e30874SSatish Balay } 157628e30874SSatish Balay 15779566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15789566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15799566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15809566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15819566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 15823ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 158328e30874SSatish Balay } 158428e30874SSatish Balay 1585d71ae5a4SJacob Faibussowitsch PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) 1586d71ae5a4SJacob Faibussowitsch { 158728e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 158828e30874SSatish Balay IS iscol = a->col, isrow = a->row; 158928e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 159028e30874SSatish Balay const PetscInt *rout, *cout, *r, *c; 159128e30874SSatish Balay PetscScalar *x, *tmp, sum; 159228e30874SSatish Balay const PetscScalar *b; 159328e30874SSatish Balay const MatScalar *aa = a->a, *v; 159428e30874SSatish Balay 159528e30874SSatish Balay PetscFunctionBegin; 15963ba16761SJacob Faibussowitsch if (!n) PetscFunctionReturn(PETSC_SUCCESS); 159728e30874SSatish Balay 15989566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15999566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 160028e30874SSatish Balay tmp = a->solve_work; 160128e30874SSatish Balay 16029371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 16039371c9d4SSatish Balay r = rout; 16049371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 16059371c9d4SSatish Balay c = cout; 160628e30874SSatish Balay 160728e30874SSatish Balay /* forward solve the lower triangular */ 160828e30874SSatish Balay tmp[0] = b[r[0]]; 160928e30874SSatish Balay v = aa; 161028e30874SSatish Balay vi = aj; 161128e30874SSatish Balay for (i = 1; i < n; i++) { 161228e30874SSatish Balay nz = ai[i + 1] - ai[i]; 161328e30874SSatish Balay sum = b[r[i]]; 161428e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 161528e30874SSatish Balay tmp[i] = sum; 16169371c9d4SSatish Balay v += nz; 16179371c9d4SSatish Balay vi += nz; 161828e30874SSatish Balay } 161928e30874SSatish Balay 162028e30874SSatish Balay /* backward solve the upper triangular */ 162128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 162228e30874SSatish Balay v = aa + adiag[i + 1] + 1; 162328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 162428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 162528e30874SSatish Balay sum = tmp[i]; 162628e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 162728e30874SSatish Balay x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 162828e30874SSatish Balay } 162928e30874SSatish Balay 16309566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 16319566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 16329566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 16339566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 16349566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 16353ba16761SJacob Faibussowitsch PetscFunctionReturn(PETSC_SUCCESS); 163628e30874SSatish Balay } 1637