128e30874SSatish Balay #include <../src/mat/impls/baij/seq/baij.h> 2af0996ceSBarry Smith #include <petsc/private/kernels/blockinvert.h> 328e30874SSatish Balay 4*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A, Vec bb, Vec xx) { 528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 728e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 828e30874SSatish Balay const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *vi; 928e30874SSatish Balay PetscInt i, nz; 1028e30874SSatish Balay const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 1128e30874SSatish Balay const MatScalar *aa = a->a, *v; 1228e30874SSatish Balay PetscScalar *x, *s, *t, *ls; 1328e30874SSatish Balay const PetscScalar *b; 1428e30874SSatish Balay 1528e30874SSatish Balay PetscFunctionBegin; 169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 179566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 1828e30874SSatish Balay t = a->solve_work; 1928e30874SSatish Balay 20*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 21*9371c9d4SSatish Balay r = rout; 22*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 23*9371c9d4SSatish Balay c = cout + (n - 1); 2428e30874SSatish Balay 2528e30874SSatish Balay /* forward solve the lower triangular */ 269566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(t, b + bs * (*r++), bs)); 2728e30874SSatish Balay for (i = 1; i < n; i++) { 2828e30874SSatish Balay v = aa + bs2 * ai[i]; 2928e30874SSatish Balay vi = aj + ai[i]; 3028e30874SSatish Balay nz = a->diag[i] - ai[i]; 3128e30874SSatish Balay s = t + bs * i; 329566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(s, b + bs * (*r++), bs)); 3328e30874SSatish Balay while (nz--) { 3428e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, s, v, t + bs * (*vi++)); 3528e30874SSatish Balay v += bs2; 3628e30874SSatish Balay } 3728e30874SSatish Balay } 3828e30874SSatish Balay /* backward solve the upper triangular */ 3928e30874SSatish Balay ls = a->solve_work + A->cmap->n; 4028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 4128e30874SSatish Balay v = aa + bs2 * (a->diag[i] + 1); 4228e30874SSatish Balay vi = aj + a->diag[i] + 1; 4328e30874SSatish Balay nz = ai[i + 1] - a->diag[i] - 1; 449566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(ls, t + i * bs, bs)); 4528e30874SSatish Balay while (nz--) { 4628e30874SSatish Balay PetscKernel_v_gets_v_minus_A_times_w(bs, ls, v, t + bs * (*vi++)); 4728e30874SSatish Balay v += bs2; 4828e30874SSatish Balay } 4928e30874SSatish Balay PetscKernel_w_gets_A_times_v(bs, ls, aa + bs2 * a->diag[i], t + i * bs); 509566063dSJacob Faibussowitsch PetscCall(PetscArraycpy(x + bs * (*c--), t + i * bs, bs)); 5128e30874SSatish Balay } 5228e30874SSatish Balay 539566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 549566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 559566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 569566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 579566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * (a->bs2) * (a->nz) - A->rmap->bs * A->cmap->n)); 5828e30874SSatish Balay PetscFunctionReturn(0); 5928e30874SSatish Balay } 6028e30874SSatish Balay 61*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A, Vec bb, Vec xx) { 6228e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6328e30874SSatish Balay IS iscol = a->col, isrow = a->row; 6428e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j; 6528e30874SSatish Balay const PetscInt *rout, *cout, *diag = a->diag, *vi, n = a->mbs; 6628e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 6728e30874SSatish Balay const MatScalar *aa = a->a, *v; 6828e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 6928e30874SSatish Balay const PetscScalar *b; 7028e30874SSatish Balay 7128e30874SSatish Balay PetscFunctionBegin; 729566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 739566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 7428e30874SSatish Balay t = a->solve_work; 7528e30874SSatish Balay 76*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 77*9371c9d4SSatish Balay r = rout; 78*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 79*9371c9d4SSatish Balay c = cout + (n - 1); 8028e30874SSatish Balay 8128e30874SSatish Balay /* forward solve the lower triangular */ 8228e30874SSatish Balay idx = 7 * (*r++); 83*9371c9d4SSatish Balay t[0] = b[idx]; 84*9371c9d4SSatish Balay t[1] = b[1 + idx]; 85*9371c9d4SSatish Balay t[2] = b[2 + idx]; 86*9371c9d4SSatish Balay t[3] = b[3 + idx]; 87*9371c9d4SSatish Balay t[4] = b[4 + idx]; 88*9371c9d4SSatish Balay t[5] = b[5 + idx]; 89*9371c9d4SSatish Balay t[6] = b[6 + idx]; 9028e30874SSatish Balay 9128e30874SSatish Balay for (i = 1; i < n; i++) { 9228e30874SSatish Balay v = aa + 49 * ai[i]; 9328e30874SSatish Balay vi = aj + ai[i]; 9428e30874SSatish Balay nz = diag[i] - ai[i]; 9528e30874SSatish Balay idx = 7 * (*r++); 96*9371c9d4SSatish Balay s1 = b[idx]; 97*9371c9d4SSatish Balay s2 = b[1 + idx]; 98*9371c9d4SSatish Balay s3 = b[2 + idx]; 99*9371c9d4SSatish Balay s4 = b[3 + idx]; 100*9371c9d4SSatish Balay s5 = b[4 + idx]; 101*9371c9d4SSatish Balay s6 = b[5 + idx]; 102*9371c9d4SSatish Balay s7 = b[6 + idx]; 10328e30874SSatish Balay while (nz--) { 10428e30874SSatish Balay idx = 7 * (*vi++); 105*9371c9d4SSatish Balay x1 = t[idx]; 106*9371c9d4SSatish Balay x2 = t[1 + idx]; 107*9371c9d4SSatish Balay x3 = t[2 + idx]; 108*9371c9d4SSatish Balay x4 = t[3 + idx]; 109*9371c9d4SSatish Balay x5 = t[4 + idx]; 110*9371c9d4SSatish Balay x6 = t[5 + idx]; 111*9371c9d4SSatish Balay x7 = t[6 + idx]; 11228e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 11328e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 11428e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 11528e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 11628e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 11728e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 11828e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 11928e30874SSatish Balay v += 49; 12028e30874SSatish Balay } 12128e30874SSatish Balay idx = 7 * i; 122*9371c9d4SSatish Balay t[idx] = s1; 123*9371c9d4SSatish Balay t[1 + idx] = s2; 124*9371c9d4SSatish Balay t[2 + idx] = s3; 125*9371c9d4SSatish Balay t[3 + idx] = s4; 126*9371c9d4SSatish Balay t[4 + idx] = s5; 127*9371c9d4SSatish Balay t[5 + idx] = s6; 128*9371c9d4SSatish Balay t[6 + idx] = s7; 12928e30874SSatish Balay } 13028e30874SSatish Balay /* backward solve the upper triangular */ 13128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 13228e30874SSatish Balay v = aa + 49 * diag[i] + 49; 13328e30874SSatish Balay vi = aj + diag[i] + 1; 13428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 13528e30874SSatish Balay idt = 7 * i; 136*9371c9d4SSatish Balay s1 = t[idt]; 137*9371c9d4SSatish Balay s2 = t[1 + idt]; 138*9371c9d4SSatish Balay s3 = t[2 + idt]; 139*9371c9d4SSatish Balay s4 = t[3 + idt]; 140*9371c9d4SSatish Balay s5 = t[4 + idt]; 141*9371c9d4SSatish Balay s6 = t[5 + idt]; 142*9371c9d4SSatish Balay s7 = t[6 + idt]; 14328e30874SSatish Balay while (nz--) { 14428e30874SSatish Balay idx = 7 * (*vi++); 145*9371c9d4SSatish Balay x1 = t[idx]; 146*9371c9d4SSatish Balay x2 = t[1 + idx]; 147*9371c9d4SSatish Balay x3 = t[2 + idx]; 148*9371c9d4SSatish Balay x4 = t[3 + idx]; 149*9371c9d4SSatish Balay x5 = t[4 + idx]; 150*9371c9d4SSatish Balay x6 = t[5 + idx]; 151*9371c9d4SSatish Balay x7 = t[6 + idx]; 15228e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 15328e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 15428e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 15528e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 15628e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 15728e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 15828e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 15928e30874SSatish Balay v += 49; 16028e30874SSatish Balay } 16128e30874SSatish Balay idc = 7 * (*c--); 16228e30874SSatish Balay v = aa + 49 * diag[i]; 163*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 164*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 165*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 166*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 167*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 168*9371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 169*9371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 17028e30874SSatish Balay } 17128e30874SSatish Balay 1729566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 1739566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 1749566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 1759566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 1769566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 17728e30874SSatish Balay PetscFunctionReturn(0); 17828e30874SSatish Balay } 17928e30874SSatish Balay 180*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_7(Mat A, Vec bb, Vec xx) { 18128e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 18228e30874SSatish Balay IS iscol = a->col, isrow = a->row; 18328e30874SSatish Balay const PetscInt *r, *c, *ai = a->i, *aj = a->j, *adiag = a->diag; 18428e30874SSatish Balay const PetscInt n = a->mbs, *rout, *cout, *vi; 18528e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 18628e30874SSatish Balay const MatScalar *aa = a->a, *v; 18728e30874SSatish Balay PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7, *x, *t; 18828e30874SSatish Balay const PetscScalar *b; 18928e30874SSatish Balay 19028e30874SSatish Balay PetscFunctionBegin; 1919566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 1929566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 19328e30874SSatish Balay t = a->solve_work; 19428e30874SSatish Balay 195*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 196*9371c9d4SSatish Balay r = rout; 197*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 198*9371c9d4SSatish Balay c = cout; 19928e30874SSatish Balay 20028e30874SSatish Balay /* forward solve the lower triangular */ 20128e30874SSatish Balay idx = 7 * r[0]; 202*9371c9d4SSatish Balay t[0] = b[idx]; 203*9371c9d4SSatish Balay t[1] = b[1 + idx]; 204*9371c9d4SSatish Balay t[2] = b[2 + idx]; 205*9371c9d4SSatish Balay t[3] = b[3 + idx]; 206*9371c9d4SSatish Balay t[4] = b[4 + idx]; 207*9371c9d4SSatish Balay t[5] = b[5 + idx]; 208*9371c9d4SSatish Balay t[6] = b[6 + idx]; 20928e30874SSatish Balay 21028e30874SSatish Balay for (i = 1; i < n; i++) { 21128e30874SSatish Balay v = aa + 49 * ai[i]; 21228e30874SSatish Balay vi = aj + ai[i]; 21328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 21428e30874SSatish Balay idx = 7 * r[i]; 215*9371c9d4SSatish Balay s1 = b[idx]; 216*9371c9d4SSatish Balay s2 = b[1 + idx]; 217*9371c9d4SSatish Balay s3 = b[2 + idx]; 218*9371c9d4SSatish Balay s4 = b[3 + idx]; 219*9371c9d4SSatish Balay s5 = b[4 + idx]; 220*9371c9d4SSatish Balay s6 = b[5 + idx]; 221*9371c9d4SSatish Balay s7 = b[6 + idx]; 22228e30874SSatish Balay for (m = 0; m < nz; m++) { 22328e30874SSatish Balay idx = 7 * vi[m]; 224*9371c9d4SSatish Balay x1 = t[idx]; 225*9371c9d4SSatish Balay x2 = t[1 + idx]; 226*9371c9d4SSatish Balay x3 = t[2 + idx]; 227*9371c9d4SSatish Balay x4 = t[3 + idx]; 228*9371c9d4SSatish Balay x5 = t[4 + idx]; 229*9371c9d4SSatish Balay x6 = t[5 + idx]; 230*9371c9d4SSatish Balay x7 = t[6 + idx]; 23128e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 23228e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 23328e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 23428e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 23528e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 23628e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 23728e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 23828e30874SSatish Balay v += 49; 23928e30874SSatish Balay } 24028e30874SSatish Balay idx = 7 * i; 241*9371c9d4SSatish Balay t[idx] = s1; 242*9371c9d4SSatish Balay t[1 + idx] = s2; 243*9371c9d4SSatish Balay t[2 + idx] = s3; 244*9371c9d4SSatish Balay t[3 + idx] = s4; 245*9371c9d4SSatish Balay t[4 + idx] = s5; 246*9371c9d4SSatish Balay t[5 + idx] = s6; 247*9371c9d4SSatish Balay t[6 + idx] = s7; 24828e30874SSatish Balay } 24928e30874SSatish Balay /* backward solve the upper triangular */ 25028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 25128e30874SSatish Balay v = aa + 49 * (adiag[i + 1] + 1); 25228e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 25328e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 25428e30874SSatish Balay idt = 7 * i; 255*9371c9d4SSatish Balay s1 = t[idt]; 256*9371c9d4SSatish Balay s2 = t[1 + idt]; 257*9371c9d4SSatish Balay s3 = t[2 + idt]; 258*9371c9d4SSatish Balay s4 = t[3 + idt]; 259*9371c9d4SSatish Balay s5 = t[4 + idt]; 260*9371c9d4SSatish Balay s6 = t[5 + idt]; 261*9371c9d4SSatish Balay s7 = t[6 + idt]; 26228e30874SSatish Balay for (m = 0; m < nz; m++) { 26328e30874SSatish Balay idx = 7 * vi[m]; 264*9371c9d4SSatish Balay x1 = t[idx]; 265*9371c9d4SSatish Balay x2 = t[1 + idx]; 266*9371c9d4SSatish Balay x3 = t[2 + idx]; 267*9371c9d4SSatish Balay x4 = t[3 + idx]; 268*9371c9d4SSatish Balay x5 = t[4 + idx]; 269*9371c9d4SSatish Balay x6 = t[5 + idx]; 270*9371c9d4SSatish Balay x7 = t[6 + idx]; 27128e30874SSatish Balay s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 27228e30874SSatish Balay s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 27328e30874SSatish Balay s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 27428e30874SSatish Balay s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 27528e30874SSatish Balay s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 27628e30874SSatish Balay s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 27728e30874SSatish Balay s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 27828e30874SSatish Balay v += 49; 27928e30874SSatish Balay } 28028e30874SSatish Balay idc = 7 * c[i]; 281*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 282*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 283*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 284*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 285*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 286*9371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 287*9371c9d4SSatish Balay x[6 + idc] = t[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 28828e30874SSatish Balay } 28928e30874SSatish Balay 2909566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 2919566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 2929566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 2939566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 2949566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 49 * (a->nz) - 7.0 * A->cmap->n)); 29528e30874SSatish Balay PetscFunctionReturn(0); 29628e30874SSatish Balay } 29728e30874SSatish Balay 298*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A, Vec bb, Vec xx) { 29928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 30028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 30128e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 30228e30874SSatish Balay const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 30328e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 30428e30874SSatish Balay const MatScalar *aa = a->a, *v; 30528e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 30628e30874SSatish Balay const PetscScalar *b; 30728e30874SSatish Balay 30828e30874SSatish Balay PetscFunctionBegin; 3099566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 3109566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 31128e30874SSatish Balay t = a->solve_work; 31228e30874SSatish Balay 313*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 314*9371c9d4SSatish Balay r = rout; 315*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 316*9371c9d4SSatish Balay c = cout + (n - 1); 31728e30874SSatish Balay 31828e30874SSatish Balay /* forward solve the lower triangular */ 31928e30874SSatish Balay idx = 6 * (*r++); 320*9371c9d4SSatish Balay t[0] = b[idx]; 321*9371c9d4SSatish Balay t[1] = b[1 + idx]; 322*9371c9d4SSatish Balay t[2] = b[2 + idx]; 323*9371c9d4SSatish Balay t[3] = b[3 + idx]; 324*9371c9d4SSatish Balay t[4] = b[4 + idx]; 325*9371c9d4SSatish Balay t[5] = b[5 + idx]; 32628e30874SSatish Balay for (i = 1; i < n; i++) { 32728e30874SSatish Balay v = aa + 36 * ai[i]; 32828e30874SSatish Balay vi = aj + ai[i]; 32928e30874SSatish Balay nz = diag[i] - ai[i]; 33028e30874SSatish Balay idx = 6 * (*r++); 331*9371c9d4SSatish Balay s1 = b[idx]; 332*9371c9d4SSatish Balay s2 = b[1 + idx]; 333*9371c9d4SSatish Balay s3 = b[2 + idx]; 334*9371c9d4SSatish Balay s4 = b[3 + idx]; 335*9371c9d4SSatish Balay s5 = b[4 + idx]; 336*9371c9d4SSatish Balay s6 = b[5 + idx]; 33728e30874SSatish Balay while (nz--) { 33828e30874SSatish Balay idx = 6 * (*vi++); 339*9371c9d4SSatish Balay x1 = t[idx]; 340*9371c9d4SSatish Balay x2 = t[1 + idx]; 341*9371c9d4SSatish Balay x3 = t[2 + idx]; 342*9371c9d4SSatish Balay x4 = t[3 + idx]; 343*9371c9d4SSatish Balay x5 = t[4 + idx]; 344*9371c9d4SSatish Balay x6 = t[5 + idx]; 34528e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 34628e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 34728e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 34828e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 34928e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 35028e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 35128e30874SSatish Balay v += 36; 35228e30874SSatish Balay } 35328e30874SSatish Balay idx = 6 * i; 354*9371c9d4SSatish Balay t[idx] = s1; 355*9371c9d4SSatish Balay t[1 + idx] = s2; 356*9371c9d4SSatish Balay t[2 + idx] = s3; 357*9371c9d4SSatish Balay t[3 + idx] = s4; 358*9371c9d4SSatish Balay t[4 + idx] = s5; 359*9371c9d4SSatish Balay t[5 + idx] = s6; 36028e30874SSatish Balay } 36128e30874SSatish Balay /* backward solve the upper triangular */ 36228e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 36328e30874SSatish Balay v = aa + 36 * diag[i] + 36; 36428e30874SSatish Balay vi = aj + diag[i] + 1; 36528e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 36628e30874SSatish Balay idt = 6 * i; 367*9371c9d4SSatish Balay s1 = t[idt]; 368*9371c9d4SSatish Balay s2 = t[1 + idt]; 369*9371c9d4SSatish Balay s3 = t[2 + idt]; 370*9371c9d4SSatish Balay s4 = t[3 + idt]; 371*9371c9d4SSatish Balay s5 = t[4 + idt]; 372*9371c9d4SSatish Balay s6 = t[5 + idt]; 37328e30874SSatish Balay while (nz--) { 37428e30874SSatish Balay idx = 6 * (*vi++); 375*9371c9d4SSatish Balay x1 = t[idx]; 376*9371c9d4SSatish Balay x2 = t[1 + idx]; 377*9371c9d4SSatish Balay x3 = t[2 + idx]; 378*9371c9d4SSatish Balay x4 = t[3 + idx]; 379*9371c9d4SSatish Balay x5 = t[4 + idx]; 380*9371c9d4SSatish Balay x6 = t[5 + idx]; 38128e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 38228e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 38328e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 38428e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 38528e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 38628e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 38728e30874SSatish Balay v += 36; 38828e30874SSatish Balay } 38928e30874SSatish Balay idc = 6 * (*c--); 39028e30874SSatish Balay v = aa + 36 * diag[i]; 391*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 392*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 393*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 394*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 395*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 396*9371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 39728e30874SSatish Balay } 39828e30874SSatish Balay 3999566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 4009566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 4019566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 4029566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 4039566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 40428e30874SSatish Balay PetscFunctionReturn(0); 40528e30874SSatish Balay } 40628e30874SSatish Balay 407*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_6(Mat A, Vec bb, Vec xx) { 40828e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 40928e30874SSatish Balay IS iscol = a->col, isrow = a->row; 41028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 41128e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 41228e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 41328e30874SSatish Balay const MatScalar *aa = a->a, *v; 41428e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6, *t; 41528e30874SSatish Balay const PetscScalar *b; 41628e30874SSatish Balay 41728e30874SSatish Balay PetscFunctionBegin; 4189566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 4199566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 42028e30874SSatish Balay t = a->solve_work; 42128e30874SSatish Balay 422*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 423*9371c9d4SSatish Balay r = rout; 424*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 425*9371c9d4SSatish Balay c = cout; 42628e30874SSatish Balay 42728e30874SSatish Balay /* forward solve the lower triangular */ 42828e30874SSatish Balay idx = 6 * r[0]; 429*9371c9d4SSatish Balay t[0] = b[idx]; 430*9371c9d4SSatish Balay t[1] = b[1 + idx]; 431*9371c9d4SSatish Balay t[2] = b[2 + idx]; 432*9371c9d4SSatish Balay t[3] = b[3 + idx]; 433*9371c9d4SSatish Balay t[4] = b[4 + idx]; 434*9371c9d4SSatish Balay t[5] = b[5 + idx]; 43528e30874SSatish Balay for (i = 1; i < n; i++) { 43628e30874SSatish Balay v = aa + 36 * ai[i]; 43728e30874SSatish Balay vi = aj + ai[i]; 43828e30874SSatish Balay nz = ai[i + 1] - ai[i]; 43928e30874SSatish Balay idx = 6 * r[i]; 440*9371c9d4SSatish Balay s1 = b[idx]; 441*9371c9d4SSatish Balay s2 = b[1 + idx]; 442*9371c9d4SSatish Balay s3 = b[2 + idx]; 443*9371c9d4SSatish Balay s4 = b[3 + idx]; 444*9371c9d4SSatish Balay s5 = b[4 + idx]; 445*9371c9d4SSatish Balay s6 = b[5 + idx]; 44628e30874SSatish Balay for (m = 0; m < nz; m++) { 44728e30874SSatish Balay idx = 6 * vi[m]; 448*9371c9d4SSatish Balay x1 = t[idx]; 449*9371c9d4SSatish Balay x2 = t[1 + idx]; 450*9371c9d4SSatish Balay x3 = t[2 + idx]; 451*9371c9d4SSatish Balay x4 = t[3 + idx]; 452*9371c9d4SSatish Balay x5 = t[4 + idx]; 453*9371c9d4SSatish Balay x6 = t[5 + idx]; 45428e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 45528e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 45628e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 45728e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 45828e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 45928e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 46028e30874SSatish Balay v += 36; 46128e30874SSatish Balay } 46228e30874SSatish Balay idx = 6 * i; 463*9371c9d4SSatish Balay t[idx] = s1; 464*9371c9d4SSatish Balay t[1 + idx] = s2; 465*9371c9d4SSatish Balay t[2 + idx] = s3; 466*9371c9d4SSatish Balay t[3 + idx] = s4; 467*9371c9d4SSatish Balay t[4 + idx] = s5; 468*9371c9d4SSatish Balay t[5 + idx] = s6; 46928e30874SSatish Balay } 47028e30874SSatish Balay /* backward solve the upper triangular */ 47128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 47228e30874SSatish Balay v = aa + 36 * (adiag[i + 1] + 1); 47328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 47428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 47528e30874SSatish Balay idt = 6 * i; 476*9371c9d4SSatish Balay s1 = t[idt]; 477*9371c9d4SSatish Balay s2 = t[1 + idt]; 478*9371c9d4SSatish Balay s3 = t[2 + idt]; 479*9371c9d4SSatish Balay s4 = t[3 + idt]; 480*9371c9d4SSatish Balay s5 = t[4 + idt]; 481*9371c9d4SSatish Balay s6 = t[5 + idt]; 48228e30874SSatish Balay for (m = 0; m < nz; m++) { 48328e30874SSatish Balay idx = 6 * vi[m]; 484*9371c9d4SSatish Balay x1 = t[idx]; 485*9371c9d4SSatish Balay x2 = t[1 + idx]; 486*9371c9d4SSatish Balay x3 = t[2 + idx]; 487*9371c9d4SSatish Balay x4 = t[3 + idx]; 488*9371c9d4SSatish Balay x5 = t[4 + idx]; 489*9371c9d4SSatish Balay x6 = t[5 + idx]; 49028e30874SSatish Balay s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 49128e30874SSatish Balay s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 49228e30874SSatish Balay s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 49328e30874SSatish Balay s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 49428e30874SSatish Balay s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 49528e30874SSatish Balay s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 49628e30874SSatish Balay v += 36; 49728e30874SSatish Balay } 49828e30874SSatish Balay idc = 6 * c[i]; 499*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 500*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 501*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 502*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 503*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 504*9371c9d4SSatish Balay x[5 + idc] = t[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 50528e30874SSatish Balay } 50628e30874SSatish Balay 5079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 5089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 5099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 5109566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 5119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 51228e30874SSatish Balay PetscFunctionReturn(0); 51328e30874SSatish Balay } 51428e30874SSatish Balay 515*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A, Vec bb, Vec xx) { 51628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 51728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 51828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout, *diag = a->diag; 51928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 52028e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 52128e30874SSatish Balay const MatScalar *aa = a->a, *v; 52228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 52328e30874SSatish Balay const PetscScalar *b; 52428e30874SSatish Balay 52528e30874SSatish Balay PetscFunctionBegin; 5269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 5279566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 52828e30874SSatish Balay t = a->solve_work; 52928e30874SSatish Balay 530*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 531*9371c9d4SSatish Balay r = rout; 532*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 533*9371c9d4SSatish Balay c = cout + (n - 1); 53428e30874SSatish Balay 53528e30874SSatish Balay /* forward solve the lower triangular */ 53628e30874SSatish Balay idx = 5 * (*r++); 537*9371c9d4SSatish Balay t[0] = b[idx]; 538*9371c9d4SSatish Balay t[1] = b[1 + idx]; 539*9371c9d4SSatish Balay t[2] = b[2 + idx]; 540*9371c9d4SSatish Balay t[3] = b[3 + idx]; 541*9371c9d4SSatish Balay t[4] = b[4 + idx]; 54228e30874SSatish Balay for (i = 1; i < n; i++) { 54328e30874SSatish Balay v = aa + 25 * ai[i]; 54428e30874SSatish Balay vi = aj + ai[i]; 54528e30874SSatish Balay nz = diag[i] - ai[i]; 54628e30874SSatish Balay idx = 5 * (*r++); 547*9371c9d4SSatish Balay s1 = b[idx]; 548*9371c9d4SSatish Balay s2 = b[1 + idx]; 549*9371c9d4SSatish Balay s3 = b[2 + idx]; 550*9371c9d4SSatish Balay s4 = b[3 + idx]; 55128e30874SSatish Balay s5 = b[4 + idx]; 55228e30874SSatish Balay while (nz--) { 55328e30874SSatish Balay idx = 5 * (*vi++); 554*9371c9d4SSatish Balay x1 = t[idx]; 555*9371c9d4SSatish Balay x2 = t[1 + idx]; 556*9371c9d4SSatish Balay x3 = t[2 + idx]; 557*9371c9d4SSatish Balay x4 = t[3 + idx]; 558*9371c9d4SSatish Balay x5 = t[4 + idx]; 55928e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 56028e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 56128e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 56228e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 56328e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 56428e30874SSatish Balay v += 25; 56528e30874SSatish Balay } 56628e30874SSatish Balay idx = 5 * i; 567*9371c9d4SSatish Balay t[idx] = s1; 568*9371c9d4SSatish Balay t[1 + idx] = s2; 569*9371c9d4SSatish Balay t[2 + idx] = s3; 570*9371c9d4SSatish Balay t[3 + idx] = s4; 571*9371c9d4SSatish Balay t[4 + idx] = s5; 57228e30874SSatish Balay } 57328e30874SSatish Balay /* backward solve the upper triangular */ 57428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 57528e30874SSatish Balay v = aa + 25 * diag[i] + 25; 57628e30874SSatish Balay vi = aj + diag[i] + 1; 57728e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 57828e30874SSatish Balay idt = 5 * i; 579*9371c9d4SSatish Balay s1 = t[idt]; 580*9371c9d4SSatish Balay s2 = t[1 + idt]; 581*9371c9d4SSatish Balay s3 = t[2 + idt]; 582*9371c9d4SSatish Balay s4 = t[3 + idt]; 583*9371c9d4SSatish Balay s5 = t[4 + idt]; 58428e30874SSatish Balay while (nz--) { 58528e30874SSatish Balay idx = 5 * (*vi++); 586*9371c9d4SSatish Balay x1 = t[idx]; 587*9371c9d4SSatish Balay x2 = t[1 + idx]; 588*9371c9d4SSatish Balay x3 = t[2 + idx]; 589*9371c9d4SSatish Balay x4 = t[3 + idx]; 590*9371c9d4SSatish Balay x5 = t[4 + idx]; 59128e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 59228e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 59328e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 59428e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 59528e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 59628e30874SSatish Balay v += 25; 59728e30874SSatish Balay } 59828e30874SSatish Balay idc = 5 * (*c--); 59928e30874SSatish Balay v = aa + 25 * diag[i]; 600*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 601*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 602*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 603*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 604*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 60528e30874SSatish Balay } 60628e30874SSatish Balay 6079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 6089566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 6099566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 6109566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 6119566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 61228e30874SSatish Balay PetscFunctionReturn(0); 61328e30874SSatish Balay } 61428e30874SSatish Balay 615*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_5(Mat A, Vec bb, Vec xx) { 61628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 61728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 61828e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 61928e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 62028e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 62128e30874SSatish Balay const MatScalar *aa = a->a, *v; 62228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, s5, x1, x2, x3, x4, x5, *t; 62328e30874SSatish Balay const PetscScalar *b; 62428e30874SSatish Balay 62528e30874SSatish Balay PetscFunctionBegin; 6269566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 6279566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 62828e30874SSatish Balay t = a->solve_work; 62928e30874SSatish Balay 630*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 631*9371c9d4SSatish Balay r = rout; 632*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 633*9371c9d4SSatish Balay c = cout; 63428e30874SSatish Balay 63528e30874SSatish Balay /* forward solve the lower triangular */ 63628e30874SSatish Balay idx = 5 * r[0]; 637*9371c9d4SSatish Balay t[0] = b[idx]; 638*9371c9d4SSatish Balay t[1] = b[1 + idx]; 639*9371c9d4SSatish Balay t[2] = b[2 + idx]; 640*9371c9d4SSatish Balay t[3] = b[3 + idx]; 641*9371c9d4SSatish Balay t[4] = b[4 + idx]; 64228e30874SSatish Balay for (i = 1; i < n; i++) { 64328e30874SSatish Balay v = aa + 25 * ai[i]; 64428e30874SSatish Balay vi = aj + ai[i]; 64528e30874SSatish Balay nz = ai[i + 1] - ai[i]; 64628e30874SSatish Balay idx = 5 * r[i]; 647*9371c9d4SSatish Balay s1 = b[idx]; 648*9371c9d4SSatish Balay s2 = b[1 + idx]; 649*9371c9d4SSatish Balay s3 = b[2 + idx]; 650*9371c9d4SSatish Balay s4 = b[3 + idx]; 65128e30874SSatish Balay s5 = b[4 + idx]; 65228e30874SSatish Balay for (m = 0; m < nz; m++) { 65328e30874SSatish Balay idx = 5 * vi[m]; 654*9371c9d4SSatish Balay x1 = t[idx]; 655*9371c9d4SSatish Balay x2 = t[1 + idx]; 656*9371c9d4SSatish Balay x3 = t[2 + idx]; 657*9371c9d4SSatish Balay x4 = t[3 + idx]; 658*9371c9d4SSatish Balay x5 = t[4 + idx]; 65928e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 66028e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 66128e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 66228e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 66328e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 66428e30874SSatish Balay v += 25; 66528e30874SSatish Balay } 66628e30874SSatish Balay idx = 5 * i; 667*9371c9d4SSatish Balay t[idx] = s1; 668*9371c9d4SSatish Balay t[1 + idx] = s2; 669*9371c9d4SSatish Balay t[2 + idx] = s3; 670*9371c9d4SSatish Balay t[3 + idx] = s4; 671*9371c9d4SSatish Balay t[4 + idx] = s5; 67228e30874SSatish Balay } 67328e30874SSatish Balay /* backward solve the upper triangular */ 67428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 67528e30874SSatish Balay v = aa + 25 * (adiag[i + 1] + 1); 67628e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 67728e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 67828e30874SSatish Balay idt = 5 * i; 679*9371c9d4SSatish Balay s1 = t[idt]; 680*9371c9d4SSatish Balay s2 = t[1 + idt]; 681*9371c9d4SSatish Balay s3 = t[2 + idt]; 682*9371c9d4SSatish Balay s4 = t[3 + idt]; 683*9371c9d4SSatish Balay s5 = t[4 + idt]; 68428e30874SSatish Balay for (m = 0; m < nz; m++) { 68528e30874SSatish Balay idx = 5 * vi[m]; 686*9371c9d4SSatish Balay x1 = t[idx]; 687*9371c9d4SSatish Balay x2 = t[1 + idx]; 688*9371c9d4SSatish Balay x3 = t[2 + idx]; 689*9371c9d4SSatish Balay x4 = t[3 + idx]; 690*9371c9d4SSatish Balay x5 = t[4 + idx]; 69128e30874SSatish Balay s1 -= v[0] * x1 + v[5] * x2 + v[10] * x3 + v[15] * x4 + v[20] * x5; 69228e30874SSatish Balay s2 -= v[1] * x1 + v[6] * x2 + v[11] * x3 + v[16] * x4 + v[21] * x5; 69328e30874SSatish Balay s3 -= v[2] * x1 + v[7] * x2 + v[12] * x3 + v[17] * x4 + v[22] * x5; 69428e30874SSatish Balay s4 -= v[3] * x1 + v[8] * x2 + v[13] * x3 + v[18] * x4 + v[23] * x5; 69528e30874SSatish Balay s5 -= v[4] * x1 + v[9] * x2 + v[14] * x3 + v[19] * x4 + v[24] * x5; 69628e30874SSatish Balay v += 25; 69728e30874SSatish Balay } 69828e30874SSatish Balay idc = 5 * c[i]; 699*9371c9d4SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[5] * s2 + v[10] * s3 + v[15] * s4 + v[20] * s5; 700*9371c9d4SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[6] * s2 + v[11] * s3 + v[16] * s4 + v[21] * s5; 701*9371c9d4SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[7] * s2 + v[12] * s3 + v[17] * s4 + v[22] * s5; 702*9371c9d4SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[8] * s2 + v[13] * s3 + v[18] * s4 + v[23] * s5; 703*9371c9d4SSatish Balay x[4 + idc] = t[4 + idt] = v[4] * s1 + v[9] * s2 + v[14] * s3 + v[19] * s4 + v[24] * s5; 70428e30874SSatish Balay } 70528e30874SSatish Balay 7069566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7079566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7089566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 7099566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 7109566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 25 * (a->nz) - 5.0 * A->cmap->n)); 71128e30874SSatish Balay PetscFunctionReturn(0); 71228e30874SSatish Balay } 71328e30874SSatish Balay 714*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A, Vec bb, Vec xx) { 71528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 71628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 71728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 71828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 71928e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 72028e30874SSatish Balay const MatScalar *aa = a->a, *v; 72128e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 72228e30874SSatish Balay const PetscScalar *b; 72328e30874SSatish Balay 72428e30874SSatish Balay PetscFunctionBegin; 7259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 7269566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 72728e30874SSatish Balay t = a->solve_work; 72828e30874SSatish Balay 729*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 730*9371c9d4SSatish Balay r = rout; 731*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 732*9371c9d4SSatish Balay c = cout + (n - 1); 73328e30874SSatish Balay 73428e30874SSatish Balay /* forward solve the lower triangular */ 73528e30874SSatish Balay idx = 4 * (*r++); 736*9371c9d4SSatish Balay t[0] = b[idx]; 737*9371c9d4SSatish Balay t[1] = b[1 + idx]; 738*9371c9d4SSatish Balay t[2] = b[2 + idx]; 739*9371c9d4SSatish Balay t[3] = b[3 + idx]; 74028e30874SSatish Balay for (i = 1; i < n; i++) { 74128e30874SSatish Balay v = aa + 16 * ai[i]; 74228e30874SSatish Balay vi = aj + ai[i]; 74328e30874SSatish Balay nz = diag[i] - ai[i]; 74428e30874SSatish Balay idx = 4 * (*r++); 745*9371c9d4SSatish Balay s1 = b[idx]; 746*9371c9d4SSatish Balay s2 = b[1 + idx]; 747*9371c9d4SSatish Balay s3 = b[2 + idx]; 748*9371c9d4SSatish Balay s4 = b[3 + idx]; 74928e30874SSatish Balay while (nz--) { 75028e30874SSatish Balay idx = 4 * (*vi++); 751*9371c9d4SSatish Balay x1 = t[idx]; 752*9371c9d4SSatish Balay x2 = t[1 + idx]; 753*9371c9d4SSatish Balay x3 = t[2 + idx]; 754*9371c9d4SSatish Balay x4 = t[3 + idx]; 75528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 75628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 75728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 75828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 75928e30874SSatish Balay v += 16; 76028e30874SSatish Balay } 76128e30874SSatish Balay idx = 4 * i; 762*9371c9d4SSatish Balay t[idx] = s1; 763*9371c9d4SSatish Balay t[1 + idx] = s2; 764*9371c9d4SSatish Balay t[2 + idx] = s3; 765*9371c9d4SSatish Balay t[3 + idx] = s4; 76628e30874SSatish Balay } 76728e30874SSatish Balay /* backward solve the upper triangular */ 76828e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 76928e30874SSatish Balay v = aa + 16 * diag[i] + 16; 77028e30874SSatish Balay vi = aj + diag[i] + 1; 77128e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 77228e30874SSatish Balay idt = 4 * i; 773*9371c9d4SSatish Balay s1 = t[idt]; 774*9371c9d4SSatish Balay s2 = t[1 + idt]; 775*9371c9d4SSatish Balay s3 = t[2 + idt]; 776*9371c9d4SSatish Balay s4 = t[3 + idt]; 77728e30874SSatish Balay while (nz--) { 77828e30874SSatish Balay idx = 4 * (*vi++); 779*9371c9d4SSatish Balay x1 = t[idx]; 780*9371c9d4SSatish Balay x2 = t[1 + idx]; 781*9371c9d4SSatish Balay x3 = t[2 + idx]; 782*9371c9d4SSatish Balay x4 = t[3 + idx]; 78328e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 78428e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 78528e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 78628e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 78728e30874SSatish Balay v += 16; 78828e30874SSatish Balay } 78928e30874SSatish Balay idc = 4 * (*c--); 79028e30874SSatish Balay v = aa + 16 * diag[i]; 79128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 79228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 79328e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 79428e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 79528e30874SSatish Balay } 79628e30874SSatish Balay 7979566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 7989566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 7999566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8009566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8019566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 80228e30874SSatish Balay PetscFunctionReturn(0); 80328e30874SSatish Balay } 80428e30874SSatish Balay 805*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4(Mat A, Vec bb, Vec xx) { 80628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 80728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 80828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 80928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 81028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 81128e30874SSatish Balay const MatScalar *aa = a->a, *v; 81228e30874SSatish Balay PetscScalar *x, s1, s2, s3, s4, x1, x2, x3, x4, *t; 81328e30874SSatish Balay const PetscScalar *b; 81428e30874SSatish Balay 81528e30874SSatish Balay PetscFunctionBegin; 8169566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 8179566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 81828e30874SSatish Balay t = a->solve_work; 81928e30874SSatish Balay 820*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 821*9371c9d4SSatish Balay r = rout; 822*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 823*9371c9d4SSatish Balay c = cout; 82428e30874SSatish Balay 82528e30874SSatish Balay /* forward solve the lower triangular */ 82628e30874SSatish Balay idx = 4 * r[0]; 827*9371c9d4SSatish Balay t[0] = b[idx]; 828*9371c9d4SSatish Balay t[1] = b[1 + idx]; 829*9371c9d4SSatish Balay t[2] = b[2 + idx]; 830*9371c9d4SSatish Balay t[3] = b[3 + idx]; 83128e30874SSatish Balay for (i = 1; i < n; i++) { 83228e30874SSatish Balay v = aa + 16 * ai[i]; 83328e30874SSatish Balay vi = aj + ai[i]; 83428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 83528e30874SSatish Balay idx = 4 * r[i]; 836*9371c9d4SSatish Balay s1 = b[idx]; 837*9371c9d4SSatish Balay s2 = b[1 + idx]; 838*9371c9d4SSatish Balay s3 = b[2 + idx]; 839*9371c9d4SSatish Balay s4 = b[3 + idx]; 84028e30874SSatish Balay for (m = 0; m < nz; m++) { 84128e30874SSatish Balay idx = 4 * vi[m]; 842*9371c9d4SSatish Balay x1 = t[idx]; 843*9371c9d4SSatish Balay x2 = t[1 + idx]; 844*9371c9d4SSatish Balay x3 = t[2 + idx]; 845*9371c9d4SSatish Balay x4 = t[3 + idx]; 84628e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 84728e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 84828e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 84928e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 85028e30874SSatish Balay v += 16; 85128e30874SSatish Balay } 85228e30874SSatish Balay idx = 4 * i; 853*9371c9d4SSatish Balay t[idx] = s1; 854*9371c9d4SSatish Balay t[1 + idx] = s2; 855*9371c9d4SSatish Balay t[2 + idx] = s3; 856*9371c9d4SSatish Balay t[3 + idx] = s4; 85728e30874SSatish Balay } 85828e30874SSatish Balay /* backward solve the upper triangular */ 85928e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 86028e30874SSatish Balay v = aa + 16 * (adiag[i + 1] + 1); 86128e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 86228e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 86328e30874SSatish Balay idt = 4 * i; 864*9371c9d4SSatish Balay s1 = t[idt]; 865*9371c9d4SSatish Balay s2 = t[1 + idt]; 866*9371c9d4SSatish Balay s3 = t[2 + idt]; 867*9371c9d4SSatish Balay s4 = t[3 + idt]; 86828e30874SSatish Balay for (m = 0; m < nz; m++) { 86928e30874SSatish Balay idx = 4 * vi[m]; 870*9371c9d4SSatish Balay x1 = t[idx]; 871*9371c9d4SSatish Balay x2 = t[1 + idx]; 872*9371c9d4SSatish Balay x3 = t[2 + idx]; 873*9371c9d4SSatish Balay x4 = t[3 + idx]; 87428e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 87528e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 87628e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 87728e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 87828e30874SSatish Balay v += 16; 87928e30874SSatish Balay } 88028e30874SSatish Balay idc = 4 * c[i]; 88128e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 88228e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 88328e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 88428e30874SSatish Balay x[3 + idc] = t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 88528e30874SSatish Balay } 88628e30874SSatish Balay 8879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 8889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 8899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 8909566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 8919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 89228e30874SSatish Balay PetscFunctionReturn(0); 89328e30874SSatish Balay } 89428e30874SSatish Balay 895*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A, Vec bb, Vec xx) { 89628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 89728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 89828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 89928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 90028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 90128e30874SSatish Balay const MatScalar *aa = a->a, *v; 90228e30874SSatish Balay MatScalar s1, s2, s3, s4, x1, x2, x3, x4, *t; 90328e30874SSatish Balay PetscScalar *x; 90428e30874SSatish Balay const PetscScalar *b; 90528e30874SSatish Balay 90628e30874SSatish Balay PetscFunctionBegin; 9079566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 9089566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 90928e30874SSatish Balay t = (MatScalar *)a->solve_work; 91028e30874SSatish Balay 911*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 912*9371c9d4SSatish Balay r = rout; 913*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 914*9371c9d4SSatish Balay c = cout + (n - 1); 91528e30874SSatish Balay 91628e30874SSatish Balay /* forward solve the lower triangular */ 91728e30874SSatish Balay idx = 4 * (*r++); 91828e30874SSatish Balay t[0] = (MatScalar)b[idx]; 91928e30874SSatish Balay t[1] = (MatScalar)b[1 + idx]; 92028e30874SSatish Balay t[2] = (MatScalar)b[2 + idx]; 92128e30874SSatish Balay t[3] = (MatScalar)b[3 + idx]; 92228e30874SSatish Balay for (i = 1; i < n; i++) { 92328e30874SSatish Balay v = aa + 16 * ai[i]; 92428e30874SSatish Balay vi = aj + ai[i]; 92528e30874SSatish Balay nz = diag[i] - ai[i]; 92628e30874SSatish Balay idx = 4 * (*r++); 92728e30874SSatish Balay s1 = (MatScalar)b[idx]; 92828e30874SSatish Balay s2 = (MatScalar)b[1 + idx]; 92928e30874SSatish Balay s3 = (MatScalar)b[2 + idx]; 93028e30874SSatish Balay s4 = (MatScalar)b[3 + idx]; 93128e30874SSatish Balay while (nz--) { 93228e30874SSatish Balay idx = 4 * (*vi++); 93328e30874SSatish Balay x1 = t[idx]; 93428e30874SSatish Balay x2 = t[1 + idx]; 93528e30874SSatish Balay x3 = t[2 + idx]; 93628e30874SSatish Balay x4 = t[3 + idx]; 93728e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 93828e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 93928e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 94028e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 94128e30874SSatish Balay v += 16; 94228e30874SSatish Balay } 94328e30874SSatish Balay idx = 4 * i; 94428e30874SSatish Balay t[idx] = s1; 94528e30874SSatish Balay t[1 + idx] = s2; 94628e30874SSatish Balay t[2 + idx] = s3; 94728e30874SSatish Balay t[3 + idx] = s4; 94828e30874SSatish Balay } 94928e30874SSatish Balay /* backward solve the upper triangular */ 95028e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 95128e30874SSatish Balay v = aa + 16 * diag[i] + 16; 95228e30874SSatish Balay vi = aj + diag[i] + 1; 95328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 95428e30874SSatish Balay idt = 4 * i; 95528e30874SSatish Balay s1 = t[idt]; 95628e30874SSatish Balay s2 = t[1 + idt]; 95728e30874SSatish Balay s3 = t[2 + idt]; 95828e30874SSatish Balay s4 = t[3 + idt]; 95928e30874SSatish Balay while (nz--) { 96028e30874SSatish Balay idx = 4 * (*vi++); 96128e30874SSatish Balay x1 = t[idx]; 96228e30874SSatish Balay x2 = t[1 + idx]; 96328e30874SSatish Balay x3 = t[2 + idx]; 96428e30874SSatish Balay x4 = t[3 + idx]; 96528e30874SSatish Balay s1 -= v[0] * x1 + v[4] * x2 + v[8] * x3 + v[12] * x4; 96628e30874SSatish Balay s2 -= v[1] * x1 + v[5] * x2 + v[9] * x3 + v[13] * x4; 96728e30874SSatish Balay s3 -= v[2] * x1 + v[6] * x2 + v[10] * x3 + v[14] * x4; 96828e30874SSatish Balay s4 -= v[3] * x1 + v[7] * x2 + v[11] * x3 + v[15] * x4; 96928e30874SSatish Balay v += 16; 97028e30874SSatish Balay } 97128e30874SSatish Balay idc = 4 * (*c--); 97228e30874SSatish Balay v = aa + 16 * diag[i]; 97328e30874SSatish Balay t[idt] = v[0] * s1 + v[4] * s2 + v[8] * s3 + v[12] * s4; 97428e30874SSatish Balay t[1 + idt] = v[1] * s1 + v[5] * s2 + v[9] * s3 + v[13] * s4; 97528e30874SSatish Balay t[2 + idt] = v[2] * s1 + v[6] * s2 + v[10] * s3 + v[14] * s4; 97628e30874SSatish Balay t[3 + idt] = v[3] * s1 + v[7] * s2 + v[11] * s3 + v[15] * s4; 97728e30874SSatish Balay x[idc] = (PetscScalar)t[idt]; 97828e30874SSatish Balay x[1 + idc] = (PetscScalar)t[1 + idt]; 97928e30874SSatish Balay x[2 + idc] = (PetscScalar)t[2 + idt]; 98028e30874SSatish Balay x[3 + idc] = (PetscScalar)t[3 + idt]; 98128e30874SSatish Balay } 98228e30874SSatish Balay 9839566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 9849566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 9859566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 9869566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 9879566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 98828e30874SSatish Balay PetscFunctionReturn(0); 98928e30874SSatish Balay } 99028e30874SSatish Balay 99128e30874SSatish Balay #if defined(PETSC_HAVE_SSE) 99228e30874SSatish Balay 99328e30874SSatish Balay #include PETSC_HAVE_SSE 99428e30874SSatish Balay 995*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A, Vec bb, Vec xx) { 99628e30874SSatish Balay /* 99728e30874SSatish Balay Note: This code uses demotion of double 99828e30874SSatish Balay to float when performing the mixed-mode computation. 99928e30874SSatish Balay This may not be numerically reasonable for all applications. 100028e30874SSatish Balay */ 100128e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 100228e30874SSatish Balay IS iscol = a->col, isrow = a->row; 100328e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, nz, idx, idt, idc, ai16; 100428e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 100528e30874SSatish Balay MatScalar *aa = a->a, *v; 100628e30874SSatish Balay PetscScalar *x, *b, *t; 100728e30874SSatish Balay 100828e30874SSatish Balay /* Make space in temp stack for 16 Byte Aligned arrays */ 100928e30874SSatish Balay float ssealignedspace[11], *tmps, *tmpx; 101028e30874SSatish Balay unsigned long offset; 101128e30874SSatish Balay 101228e30874SSatish Balay PetscFunctionBegin; 101328e30874SSatish Balay SSE_SCOPE_BEGIN; 101428e30874SSatish Balay 101528e30874SSatish Balay offset = (unsigned long)ssealignedspace % 16; 101628e30874SSatish Balay if (offset) offset = (16 - offset) / 4; 101728e30874SSatish Balay tmps = &ssealignedspace[offset]; 101828e30874SSatish Balay tmpx = &ssealignedspace[offset + 4]; 101928e30874SSatish Balay PREFETCH_NTA(aa + 16 * ai[1]); 102028e30874SSatish Balay 10219566063dSJacob Faibussowitsch PetscCall(VecGetArray(bb, &b)); 10229566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 102328e30874SSatish Balay t = a->solve_work; 102428e30874SSatish Balay 1025*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1026*9371c9d4SSatish Balay r = rout; 1027*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1028*9371c9d4SSatish Balay c = cout + (n - 1); 102928e30874SSatish Balay 103028e30874SSatish Balay /* forward solve the lower triangular */ 103128e30874SSatish Balay idx = 4 * (*r++); 1032*9371c9d4SSatish Balay t[0] = b[idx]; 1033*9371c9d4SSatish Balay t[1] = b[1 + idx]; 1034*9371c9d4SSatish Balay t[2] = b[2 + idx]; 1035*9371c9d4SSatish Balay t[3] = b[3 + idx]; 103628e30874SSatish Balay v = aa + 16 * ai[1]; 103728e30874SSatish Balay 103828e30874SSatish Balay for (i = 1; i < n;) { 103928e30874SSatish Balay PREFETCH_NTA(&v[8]); 104028e30874SSatish Balay vi = aj + ai[i]; 104128e30874SSatish Balay nz = diag[i] - ai[i]; 104228e30874SSatish Balay idx = 4 * (*r++); 104328e30874SSatish Balay 104428e30874SSatish Balay /* Demote sum from double to float */ 104528e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &b[idx]); 104628e30874SSatish Balay LOAD_PS(tmps, XMM7); 104728e30874SSatish Balay 104828e30874SSatish Balay while (nz--) { 104928e30874SSatish Balay PREFETCH_NTA(&v[16]); 105028e30874SSatish Balay idx = 4 * (*vi++); 105128e30874SSatish Balay 105228e30874SSatish Balay /* Demote solution (so far) from double to float */ 105328e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &x[idx]); 105428e30874SSatish Balay 105528e30874SSatish Balay /* 4x4 Matrix-Vector product with negative accumulation: */ 105628e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 105728e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 105828e30874SSatish Balay 105928e30874SSatish Balay /* First Column */ 106028e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 106128e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 106228e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 106328e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 106428e30874SSatish Balay 106528e30874SSatish Balay /* Second Column */ 106628e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 106728e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 106828e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 106928e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 107028e30874SSatish Balay 107128e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 107228e30874SSatish Balay 107328e30874SSatish Balay /* Third Column */ 107428e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 107528e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 107628e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 107728e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 107828e30874SSatish Balay 107928e30874SSatish Balay /* Fourth Column */ 108028e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 108128e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 108228e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 108328e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 108428e30874SSatish Balay SSE_INLINE_END_2 108528e30874SSatish Balay 108628e30874SSatish Balay v += 16; 108728e30874SSatish Balay } 108828e30874SSatish Balay idx = 4 * i; 108928e30874SSatish Balay v = aa + 16 * ai[++i]; 109028e30874SSatish Balay PREFETCH_NTA(v); 109128e30874SSatish Balay STORE_PS(tmps, XMM7); 109228e30874SSatish Balay 109328e30874SSatish Balay /* Promote result from float to double */ 109428e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idx], tmps); 109528e30874SSatish Balay } 109628e30874SSatish Balay /* backward solve the upper triangular */ 109728e30874SSatish Balay idt = 4 * (n - 1); 109828e30874SSatish Balay ai16 = 16 * diag[n - 1]; 109928e30874SSatish Balay v = aa + ai16 + 16; 110028e30874SSatish Balay for (i = n - 1; i >= 0;) { 110128e30874SSatish Balay PREFETCH_NTA(&v[8]); 110228e30874SSatish Balay vi = aj + diag[i] + 1; 110328e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 110428e30874SSatish Balay 110528e30874SSatish Balay /* Demote accumulator from double to float */ 110628e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmps, &t[idt]); 110728e30874SSatish Balay LOAD_PS(tmps, XMM7); 110828e30874SSatish Balay 110928e30874SSatish Balay while (nz--) { 111028e30874SSatish Balay PREFETCH_NTA(&v[16]); 111128e30874SSatish Balay idx = 4 * (*vi++); 111228e30874SSatish Balay 111328e30874SSatish Balay /* Demote solution (so far) from double to float */ 111428e30874SSatish Balay CONVERT_DOUBLE4_FLOAT4(tmpx, &t[idx]); 111528e30874SSatish Balay 111628e30874SSatish Balay /* 4x4 Matrix-Vector Product with negative accumulation: */ 111728e30874SSatish Balay SSE_INLINE_BEGIN_2(tmpx, v) 111828e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM6) 111928e30874SSatish Balay 112028e30874SSatish Balay /* First Column */ 112128e30874SSatish Balay SSE_COPY_PS(XMM0, XMM6) 112228e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 112328e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_2, FLOAT_0) 112428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM0) 112528e30874SSatish Balay 112628e30874SSatish Balay /* Second Column */ 112728e30874SSatish Balay SSE_COPY_PS(XMM1, XMM6) 112828e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 112928e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_2, FLOAT_4) 113028e30874SSatish Balay SSE_SUB_PS(XMM7, XMM1) 113128e30874SSatish Balay 113228e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_2, FLOAT_24) 113328e30874SSatish Balay 113428e30874SSatish Balay /* Third Column */ 113528e30874SSatish Balay SSE_COPY_PS(XMM2, XMM6) 113628e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 113728e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_2, FLOAT_8) 113828e30874SSatish Balay SSE_SUB_PS(XMM7, XMM2) 113928e30874SSatish Balay 114028e30874SSatish Balay /* Fourth Column */ 114128e30874SSatish Balay SSE_COPY_PS(XMM3, XMM6) 114228e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 114328e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_2, FLOAT_12) 114428e30874SSatish Balay SSE_SUB_PS(XMM7, XMM3) 114528e30874SSatish Balay SSE_INLINE_END_2 114628e30874SSatish Balay v += 16; 114728e30874SSatish Balay } 114828e30874SSatish Balay v = aa + ai16; 114928e30874SSatish Balay ai16 = 16 * diag[--i]; 115028e30874SSatish Balay PREFETCH_NTA(aa + ai16 + 16); 115128e30874SSatish Balay /* 115228e30874SSatish Balay Scale the result by the diagonal 4x4 block, 115328e30874SSatish Balay which was inverted as part of the factorization 115428e30874SSatish Balay */ 115528e30874SSatish Balay SSE_INLINE_BEGIN_3(v, tmps, aa + ai16) 115628e30874SSatish Balay /* First Column */ 115728e30874SSatish Balay SSE_COPY_PS(XMM0, XMM7) 115828e30874SSatish Balay SSE_SHUFFLE(XMM0, XMM0, 0x00) 115928e30874SSatish Balay SSE_MULT_PS_M(XMM0, SSE_ARG_1, FLOAT_0) 116028e30874SSatish Balay 116128e30874SSatish Balay /* Second Column */ 116228e30874SSatish Balay SSE_COPY_PS(XMM1, XMM7) 116328e30874SSatish Balay SSE_SHUFFLE(XMM1, XMM1, 0x55) 116428e30874SSatish Balay SSE_MULT_PS_M(XMM1, SSE_ARG_1, FLOAT_4) 116528e30874SSatish Balay SSE_ADD_PS(XMM0, XMM1) 116628e30874SSatish Balay 116728e30874SSatish Balay SSE_PREFETCH_NTA(SSE_ARG_3, FLOAT_24) 116828e30874SSatish Balay 116928e30874SSatish Balay /* Third Column */ 117028e30874SSatish Balay SSE_COPY_PS(XMM2, XMM7) 117128e30874SSatish Balay SSE_SHUFFLE(XMM2, XMM2, 0xAA) 117228e30874SSatish Balay SSE_MULT_PS_M(XMM2, SSE_ARG_1, FLOAT_8) 117328e30874SSatish Balay SSE_ADD_PS(XMM0, XMM2) 117428e30874SSatish Balay 117528e30874SSatish Balay /* Fourth Column */ 117628e30874SSatish Balay SSE_COPY_PS(XMM3, XMM7) 117728e30874SSatish Balay SSE_SHUFFLE(XMM3, XMM3, 0xFF) 117828e30874SSatish Balay SSE_MULT_PS_M(XMM3, SSE_ARG_1, FLOAT_12) 117928e30874SSatish Balay SSE_ADD_PS(XMM0, XMM3) 118028e30874SSatish Balay 118128e30874SSatish Balay SSE_STORE_PS(SSE_ARG_2, FLOAT_0, XMM0) 118228e30874SSatish Balay SSE_INLINE_END_3 118328e30874SSatish Balay 118428e30874SSatish Balay /* Promote solution from float to double */ 118528e30874SSatish Balay CONVERT_FLOAT4_DOUBLE4(&t[idt], tmps); 118628e30874SSatish Balay 118728e30874SSatish Balay /* Apply reordering to t and stream into x. */ 118828e30874SSatish Balay /* This way, x doesn't pollute the cache. */ 118928e30874SSatish Balay /* Be careful with size: 2 doubles = 4 floats! */ 119028e30874SSatish Balay idc = 4 * (*c--); 119128e30874SSatish Balay SSE_INLINE_BEGIN_2((float *)&t[idt], (float *)&x[idc]) 119228e30874SSatish Balay /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 119328e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_0, XMM0) 119428e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_0, XMM0) 119528e30874SSatish Balay /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 119628e30874SSatish Balay SSE_LOAD_PS(SSE_ARG_1, FLOAT_4, XMM1) 119728e30874SSatish Balay SSE_STREAM_PS(SSE_ARG_2, FLOAT_4, XMM1) 119828e30874SSatish Balay SSE_INLINE_END_2 119928e30874SSatish Balay v = aa + ai16 + 16; 120028e30874SSatish Balay idt -= 4; 120128e30874SSatish Balay } 120228e30874SSatish Balay 12039566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12049566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12059566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(bb, &b)); 12069566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12079566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 120828e30874SSatish Balay SSE_SCOPE_END; 120928e30874SSatish Balay PetscFunctionReturn(0); 121028e30874SSatish Balay } 121128e30874SSatish Balay 121228e30874SSatish Balay #endif 121328e30874SSatish Balay 1214*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A, Vec bb, Vec xx) { 121528e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 121628e30874SSatish Balay IS iscol = a->col, isrow = a->row; 121728e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 121828e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 121928e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 122028e30874SSatish Balay const MatScalar *aa = a->a, *v; 122128e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 122228e30874SSatish Balay const PetscScalar *b; 122328e30874SSatish Balay 122428e30874SSatish Balay PetscFunctionBegin; 12259566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 12269566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 122728e30874SSatish Balay t = a->solve_work; 122828e30874SSatish Balay 1229*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1230*9371c9d4SSatish Balay r = rout; 1231*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1232*9371c9d4SSatish Balay c = cout + (n - 1); 123328e30874SSatish Balay 123428e30874SSatish Balay /* forward solve the lower triangular */ 123528e30874SSatish Balay idx = 3 * (*r++); 1236*9371c9d4SSatish Balay t[0] = b[idx]; 1237*9371c9d4SSatish Balay t[1] = b[1 + idx]; 1238*9371c9d4SSatish Balay t[2] = b[2 + idx]; 123928e30874SSatish Balay for (i = 1; i < n; i++) { 124028e30874SSatish Balay v = aa + 9 * ai[i]; 124128e30874SSatish Balay vi = aj + ai[i]; 124228e30874SSatish Balay nz = diag[i] - ai[i]; 124328e30874SSatish Balay idx = 3 * (*r++); 1244*9371c9d4SSatish Balay s1 = b[idx]; 1245*9371c9d4SSatish Balay s2 = b[1 + idx]; 1246*9371c9d4SSatish Balay s3 = b[2 + idx]; 124728e30874SSatish Balay while (nz--) { 124828e30874SSatish Balay idx = 3 * (*vi++); 1249*9371c9d4SSatish Balay x1 = t[idx]; 1250*9371c9d4SSatish Balay x2 = t[1 + idx]; 1251*9371c9d4SSatish Balay x3 = t[2 + idx]; 125228e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 125328e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 125428e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 125528e30874SSatish Balay v += 9; 125628e30874SSatish Balay } 125728e30874SSatish Balay idx = 3 * i; 1258*9371c9d4SSatish Balay t[idx] = s1; 1259*9371c9d4SSatish Balay t[1 + idx] = s2; 1260*9371c9d4SSatish Balay t[2 + idx] = s3; 126128e30874SSatish Balay } 126228e30874SSatish Balay /* backward solve the upper triangular */ 126328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 126428e30874SSatish Balay v = aa + 9 * diag[i] + 9; 126528e30874SSatish Balay vi = aj + diag[i] + 1; 126628e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 126728e30874SSatish Balay idt = 3 * i; 1268*9371c9d4SSatish Balay s1 = t[idt]; 1269*9371c9d4SSatish Balay s2 = t[1 + idt]; 1270*9371c9d4SSatish Balay s3 = t[2 + idt]; 127128e30874SSatish Balay while (nz--) { 127228e30874SSatish Balay idx = 3 * (*vi++); 1273*9371c9d4SSatish Balay x1 = t[idx]; 1274*9371c9d4SSatish Balay x2 = t[1 + idx]; 1275*9371c9d4SSatish Balay x3 = t[2 + idx]; 127628e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 127728e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 127828e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 127928e30874SSatish Balay v += 9; 128028e30874SSatish Balay } 128128e30874SSatish Balay idc = 3 * (*c--); 128228e30874SSatish Balay v = aa + 9 * diag[i]; 128328e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 128428e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 128528e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 128628e30874SSatish Balay } 12879566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 12889566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 12899566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 12909566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 12919566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 129228e30874SSatish Balay PetscFunctionReturn(0); 129328e30874SSatish Balay } 129428e30874SSatish Balay 1295*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_3(Mat A, Vec bb, Vec xx) { 129628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 129728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 129828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 129928e30874SSatish Balay PetscInt i, nz, idx, idt, idc, m; 130028e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 130128e30874SSatish Balay const MatScalar *aa = a->a, *v; 130228e30874SSatish Balay PetscScalar *x, s1, s2, s3, x1, x2, x3, *t; 130328e30874SSatish Balay const PetscScalar *b; 130428e30874SSatish Balay 130528e30874SSatish Balay PetscFunctionBegin; 13069566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13079566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 130828e30874SSatish Balay t = a->solve_work; 130928e30874SSatish Balay 1310*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1311*9371c9d4SSatish Balay r = rout; 1312*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1313*9371c9d4SSatish Balay c = cout; 131428e30874SSatish Balay 131528e30874SSatish Balay /* forward solve the lower triangular */ 131628e30874SSatish Balay idx = 3 * r[0]; 1317*9371c9d4SSatish Balay t[0] = b[idx]; 1318*9371c9d4SSatish Balay t[1] = b[1 + idx]; 1319*9371c9d4SSatish Balay t[2] = b[2 + idx]; 132028e30874SSatish Balay for (i = 1; i < n; i++) { 132128e30874SSatish Balay v = aa + 9 * ai[i]; 132228e30874SSatish Balay vi = aj + ai[i]; 132328e30874SSatish Balay nz = ai[i + 1] - ai[i]; 132428e30874SSatish Balay idx = 3 * r[i]; 1325*9371c9d4SSatish Balay s1 = b[idx]; 1326*9371c9d4SSatish Balay s2 = b[1 + idx]; 1327*9371c9d4SSatish Balay s3 = b[2 + idx]; 132828e30874SSatish Balay for (m = 0; m < nz; m++) { 132928e30874SSatish Balay idx = 3 * vi[m]; 1330*9371c9d4SSatish Balay x1 = t[idx]; 1331*9371c9d4SSatish Balay x2 = t[1 + idx]; 1332*9371c9d4SSatish Balay x3 = t[2 + idx]; 133328e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 133428e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 133528e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 133628e30874SSatish Balay v += 9; 133728e30874SSatish Balay } 133828e30874SSatish Balay idx = 3 * i; 1339*9371c9d4SSatish Balay t[idx] = s1; 1340*9371c9d4SSatish Balay t[1 + idx] = s2; 1341*9371c9d4SSatish Balay t[2 + idx] = s3; 134228e30874SSatish Balay } 134328e30874SSatish Balay /* backward solve the upper triangular */ 134428e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 134528e30874SSatish Balay v = aa + 9 * (adiag[i + 1] + 1); 134628e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 134728e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 134828e30874SSatish Balay idt = 3 * i; 1349*9371c9d4SSatish Balay s1 = t[idt]; 1350*9371c9d4SSatish Balay s2 = t[1 + idt]; 1351*9371c9d4SSatish Balay s3 = t[2 + idt]; 135228e30874SSatish Balay for (m = 0; m < nz; m++) { 135328e30874SSatish Balay idx = 3 * vi[m]; 1354*9371c9d4SSatish Balay x1 = t[idx]; 1355*9371c9d4SSatish Balay x2 = t[1 + idx]; 1356*9371c9d4SSatish Balay x3 = t[2 + idx]; 135728e30874SSatish Balay s1 -= v[0] * x1 + v[3] * x2 + v[6] * x3; 135828e30874SSatish Balay s2 -= v[1] * x1 + v[4] * x2 + v[7] * x3; 135928e30874SSatish Balay s3 -= v[2] * x1 + v[5] * x2 + v[8] * x3; 136028e30874SSatish Balay v += 9; 136128e30874SSatish Balay } 136228e30874SSatish Balay idc = 3 * c[i]; 136328e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[3] * s2 + v[6] * s3; 136428e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[4] * s2 + v[7] * s3; 136528e30874SSatish Balay x[2 + idc] = t[2 + idt] = v[2] * s1 + v[5] * s2 + v[8] * s3; 136628e30874SSatish Balay } 13679566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 13689566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 13699566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 13709566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 13719566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 9 * (a->nz) - 3.0 * A->cmap->n)); 137228e30874SSatish Balay PetscFunctionReturn(0); 137328e30874SSatish Balay } 137428e30874SSatish Balay 1375*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A, Vec bb, Vec xx) { 137628e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 137728e30874SSatish Balay IS iscol = a->col, isrow = a->row; 137828e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 137928e30874SSatish Balay PetscInt i, nz, idx, idt, idc; 138028e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 138128e30874SSatish Balay const MatScalar *aa = a->a, *v; 138228e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 138328e30874SSatish Balay const PetscScalar *b; 138428e30874SSatish Balay 138528e30874SSatish Balay PetscFunctionBegin; 13869566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 13879566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 138828e30874SSatish Balay t = a->solve_work; 138928e30874SSatish Balay 1390*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1391*9371c9d4SSatish Balay r = rout; 1392*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1393*9371c9d4SSatish Balay c = cout + (n - 1); 139428e30874SSatish Balay 139528e30874SSatish Balay /* forward solve the lower triangular */ 139628e30874SSatish Balay idx = 2 * (*r++); 1397*9371c9d4SSatish Balay t[0] = b[idx]; 1398*9371c9d4SSatish Balay t[1] = b[1 + idx]; 139928e30874SSatish Balay for (i = 1; i < n; i++) { 140028e30874SSatish Balay v = aa + 4 * ai[i]; 140128e30874SSatish Balay vi = aj + ai[i]; 140228e30874SSatish Balay nz = diag[i] - ai[i]; 140328e30874SSatish Balay idx = 2 * (*r++); 1404*9371c9d4SSatish Balay s1 = b[idx]; 1405*9371c9d4SSatish Balay s2 = b[1 + idx]; 140628e30874SSatish Balay while (nz--) { 140728e30874SSatish Balay idx = 2 * (*vi++); 1408*9371c9d4SSatish Balay x1 = t[idx]; 1409*9371c9d4SSatish Balay x2 = t[1 + idx]; 141028e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 141128e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 141228e30874SSatish Balay v += 4; 141328e30874SSatish Balay } 141428e30874SSatish Balay idx = 2 * i; 1415*9371c9d4SSatish Balay t[idx] = s1; 1416*9371c9d4SSatish Balay t[1 + idx] = s2; 141728e30874SSatish Balay } 141828e30874SSatish Balay /* backward solve the upper triangular */ 141928e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 142028e30874SSatish Balay v = aa + 4 * diag[i] + 4; 142128e30874SSatish Balay vi = aj + diag[i] + 1; 142228e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 142328e30874SSatish Balay idt = 2 * i; 1424*9371c9d4SSatish Balay s1 = t[idt]; 1425*9371c9d4SSatish Balay s2 = t[1 + idt]; 142628e30874SSatish Balay while (nz--) { 142728e30874SSatish Balay idx = 2 * (*vi++); 1428*9371c9d4SSatish Balay x1 = t[idx]; 1429*9371c9d4SSatish Balay x2 = t[1 + idx]; 143028e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 143128e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 143228e30874SSatish Balay v += 4; 143328e30874SSatish Balay } 143428e30874SSatish Balay idc = 2 * (*c--); 143528e30874SSatish Balay v = aa + 4 * diag[i]; 143628e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 143728e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 143828e30874SSatish Balay } 14399566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 14409566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 14419566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 14429566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 14439566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 144428e30874SSatish Balay PetscFunctionReturn(0); 144528e30874SSatish Balay } 144628e30874SSatish Balay 1447*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_2(Mat A, Vec bb, Vec xx) { 144828e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 144928e30874SSatish Balay IS iscol = a->col, isrow = a->row; 145028e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 145128e30874SSatish Balay PetscInt i, nz, idx, jdx, idt, idc, m; 145228e30874SSatish Balay const PetscInt *r, *c, *rout, *cout; 145328e30874SSatish Balay const MatScalar *aa = a->a, *v; 145428e30874SSatish Balay PetscScalar *x, s1, s2, x1, x2, *t; 145528e30874SSatish Balay const PetscScalar *b; 145628e30874SSatish Balay 145728e30874SSatish Balay PetscFunctionBegin; 14589566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 14599566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 146028e30874SSatish Balay t = a->solve_work; 146128e30874SSatish Balay 1462*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1463*9371c9d4SSatish Balay r = rout; 1464*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1465*9371c9d4SSatish Balay c = cout; 146628e30874SSatish Balay 146728e30874SSatish Balay /* forward solve the lower triangular */ 146828e30874SSatish Balay idx = 2 * r[0]; 1469*9371c9d4SSatish Balay t[0] = b[idx]; 1470*9371c9d4SSatish Balay t[1] = b[1 + idx]; 147128e30874SSatish Balay for (i = 1; i < n; i++) { 147228e30874SSatish Balay v = aa + 4 * ai[i]; 147328e30874SSatish Balay vi = aj + ai[i]; 147428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 147528e30874SSatish Balay idx = 2 * r[i]; 1476*9371c9d4SSatish Balay s1 = b[idx]; 1477*9371c9d4SSatish Balay s2 = b[1 + idx]; 147828e30874SSatish Balay for (m = 0; m < nz; m++) { 147928e30874SSatish Balay jdx = 2 * vi[m]; 1480*9371c9d4SSatish Balay x1 = t[jdx]; 1481*9371c9d4SSatish Balay x2 = t[1 + jdx]; 148228e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 148328e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 148428e30874SSatish Balay v += 4; 148528e30874SSatish Balay } 148628e30874SSatish Balay idx = 2 * i; 1487*9371c9d4SSatish Balay t[idx] = s1; 1488*9371c9d4SSatish Balay t[1 + idx] = s2; 148928e30874SSatish Balay } 149028e30874SSatish Balay /* backward solve the upper triangular */ 149128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 149228e30874SSatish Balay v = aa + 4 * (adiag[i + 1] + 1); 149328e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 149428e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 149528e30874SSatish Balay idt = 2 * i; 1496*9371c9d4SSatish Balay s1 = t[idt]; 1497*9371c9d4SSatish Balay s2 = t[1 + idt]; 149828e30874SSatish Balay for (m = 0; m < nz; m++) { 149928e30874SSatish Balay idx = 2 * vi[m]; 1500*9371c9d4SSatish Balay x1 = t[idx]; 1501*9371c9d4SSatish Balay x2 = t[1 + idx]; 150228e30874SSatish Balay s1 -= v[0] * x1 + v[2] * x2; 150328e30874SSatish Balay s2 -= v[1] * x1 + v[3] * x2; 150428e30874SSatish Balay v += 4; 150528e30874SSatish Balay } 150628e30874SSatish Balay idc = 2 * c[i]; 150728e30874SSatish Balay x[idc] = t[idt] = v[0] * s1 + v[2] * s2; 150828e30874SSatish Balay x[1 + idc] = t[1 + idt] = v[1] * s1 + v[3] * s2; 150928e30874SSatish Balay } 15109566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15119566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15129566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15139566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15149566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 4 * (a->nz) - 2.0 * A->cmap->n)); 151528e30874SSatish Balay PetscFunctionReturn(0); 151628e30874SSatish Balay } 151728e30874SSatish Balay 1518*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A, Vec bb, Vec xx) { 151928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 152028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 152128e30874SSatish Balay const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j; 152228e30874SSatish Balay PetscInt i, nz; 152328e30874SSatish Balay const PetscInt *r, *c, *diag = a->diag, *rout, *cout; 152428e30874SSatish Balay const MatScalar *aa = a->a, *v; 152528e30874SSatish Balay PetscScalar *x, s1, *t; 152628e30874SSatish Balay const PetscScalar *b; 152728e30874SSatish Balay 152828e30874SSatish Balay PetscFunctionBegin; 152928e30874SSatish Balay if (!n) PetscFunctionReturn(0); 153028e30874SSatish Balay 15319566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15329566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 153328e30874SSatish Balay t = a->solve_work; 153428e30874SSatish Balay 1535*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1536*9371c9d4SSatish Balay r = rout; 1537*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1538*9371c9d4SSatish Balay c = cout + (n - 1); 153928e30874SSatish Balay 154028e30874SSatish Balay /* forward solve the lower triangular */ 154128e30874SSatish Balay t[0] = b[*r++]; 154228e30874SSatish Balay for (i = 1; i < n; i++) { 154328e30874SSatish Balay v = aa + ai[i]; 154428e30874SSatish Balay vi = aj + ai[i]; 154528e30874SSatish Balay nz = diag[i] - ai[i]; 154628e30874SSatish Balay s1 = b[*r++]; 1547*9371c9d4SSatish Balay while (nz--) { s1 -= (*v++) * t[*vi++]; } 154828e30874SSatish Balay t[i] = s1; 154928e30874SSatish Balay } 155028e30874SSatish Balay /* backward solve the upper triangular */ 155128e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 155228e30874SSatish Balay v = aa + diag[i] + 1; 155328e30874SSatish Balay vi = aj + diag[i] + 1; 155428e30874SSatish Balay nz = ai[i + 1] - diag[i] - 1; 155528e30874SSatish Balay s1 = t[i]; 1556*9371c9d4SSatish Balay while (nz--) { s1 -= (*v++) * t[*vi++]; } 155728e30874SSatish Balay x[*c--] = t[i] = aa[diag[i]] * s1; 155828e30874SSatish Balay } 155928e30874SSatish Balay 15609566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 15619566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 15629566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 15639566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 15649566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * 1 * (a->nz) - A->cmap->n)); 156528e30874SSatish Balay PetscFunctionReturn(0); 156628e30874SSatish Balay } 156728e30874SSatish Balay 1568*9371c9d4SSatish Balay PetscErrorCode MatSolve_SeqBAIJ_1(Mat A, Vec bb, Vec xx) { 156928e30874SSatish Balay Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 157028e30874SSatish Balay IS iscol = a->col, isrow = a->row; 157128e30874SSatish Balay PetscInt i, n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag, nz; 157228e30874SSatish Balay const PetscInt *rout, *cout, *r, *c; 157328e30874SSatish Balay PetscScalar *x, *tmp, sum; 157428e30874SSatish Balay const PetscScalar *b; 157528e30874SSatish Balay const MatScalar *aa = a->a, *v; 157628e30874SSatish Balay 157728e30874SSatish Balay PetscFunctionBegin; 157828e30874SSatish Balay if (!n) PetscFunctionReturn(0); 157928e30874SSatish Balay 15809566063dSJacob Faibussowitsch PetscCall(VecGetArrayRead(bb, &b)); 15819566063dSJacob Faibussowitsch PetscCall(VecGetArray(xx, &x)); 158228e30874SSatish Balay tmp = a->solve_work; 158328e30874SSatish Balay 1584*9371c9d4SSatish Balay PetscCall(ISGetIndices(isrow, &rout)); 1585*9371c9d4SSatish Balay r = rout; 1586*9371c9d4SSatish Balay PetscCall(ISGetIndices(iscol, &cout)); 1587*9371c9d4SSatish Balay c = cout; 158828e30874SSatish Balay 158928e30874SSatish Balay /* forward solve the lower triangular */ 159028e30874SSatish Balay tmp[0] = b[r[0]]; 159128e30874SSatish Balay v = aa; 159228e30874SSatish Balay vi = aj; 159328e30874SSatish Balay for (i = 1; i < n; i++) { 159428e30874SSatish Balay nz = ai[i + 1] - ai[i]; 159528e30874SSatish Balay sum = b[r[i]]; 159628e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 159728e30874SSatish Balay tmp[i] = sum; 1598*9371c9d4SSatish Balay v += nz; 1599*9371c9d4SSatish Balay vi += nz; 160028e30874SSatish Balay } 160128e30874SSatish Balay 160228e30874SSatish Balay /* backward solve the upper triangular */ 160328e30874SSatish Balay for (i = n - 1; i >= 0; i--) { 160428e30874SSatish Balay v = aa + adiag[i + 1] + 1; 160528e30874SSatish Balay vi = aj + adiag[i + 1] + 1; 160628e30874SSatish Balay nz = adiag[i] - adiag[i + 1] - 1; 160728e30874SSatish Balay sum = tmp[i]; 160828e30874SSatish Balay PetscSparseDenseMinusDot(sum, tmp, v, vi, nz); 160928e30874SSatish Balay x[c[i]] = tmp[i] = sum * v[nz]; /* v[nz] = aa[adiag[i]] */ 161028e30874SSatish Balay } 161128e30874SSatish Balay 16129566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(isrow, &rout)); 16139566063dSJacob Faibussowitsch PetscCall(ISRestoreIndices(iscol, &cout)); 16149566063dSJacob Faibussowitsch PetscCall(VecRestoreArrayRead(bb, &b)); 16159566063dSJacob Faibussowitsch PetscCall(VecRestoreArray(xx, &x)); 16169566063dSJacob Faibussowitsch PetscCall(PetscLogFlops(2.0 * a->nz - A->cmap->n)); 161728e30874SSatish Balay PetscFunctionReturn(0); 161828e30874SSatish Balay } 1619