1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 5 { 6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 7 PetscInt i, nz, idx, idt, jdx; 8 const PetscInt *diag = a->diag, *vi, n = a->mbs, *ai = a->i, *aj = a->j; 9 const MatScalar *aa = a->a, *v; 10 PetscScalar *x, s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6; 11 const PetscScalar *b; 12 13 PetscFunctionBegin; 14 PetscCall(VecGetArrayRead(bb, &b)); 15 PetscCall(VecGetArray(xx, &x)); 16 /* forward solve the lower triangular */ 17 idx = 0; 18 x[0] = b[idx]; 19 x[1] = b[1 + idx]; 20 x[2] = b[2 + idx]; 21 x[3] = b[3 + idx]; 22 x[4] = b[4 + idx]; 23 x[5] = b[5 + idx]; 24 for (i = 1; i < n; i++) { 25 v = aa + 36 * ai[i]; 26 vi = aj + ai[i]; 27 nz = diag[i] - ai[i]; 28 idx = 6 * i; 29 s1 = b[idx]; 30 s2 = b[1 + idx]; 31 s3 = b[2 + idx]; 32 s4 = b[3 + idx]; 33 s5 = b[4 + idx]; 34 s6 = b[5 + idx]; 35 while (nz--) { 36 jdx = 6 * (*vi++); 37 x1 = x[jdx]; 38 x2 = x[1 + jdx]; 39 x3 = x[2 + jdx]; 40 x4 = x[3 + jdx]; 41 x5 = x[4 + jdx]; 42 x6 = x[5 + jdx]; 43 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 44 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 45 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 46 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 47 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 48 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 49 v += 36; 50 } 51 x[idx] = s1; 52 x[1 + idx] = s2; 53 x[2 + idx] = s3; 54 x[3 + idx] = s4; 55 x[4 + idx] = s5; 56 x[5 + idx] = s6; 57 } 58 /* backward solve the upper triangular */ 59 for (i = n - 1; i >= 0; i--) { 60 v = aa + 36 * diag[i] + 36; 61 vi = aj + diag[i] + 1; 62 nz = ai[i + 1] - diag[i] - 1; 63 idt = 6 * i; 64 s1 = x[idt]; 65 s2 = x[1 + idt]; 66 s3 = x[2 + idt]; 67 s4 = x[3 + idt]; 68 s5 = x[4 + idt]; 69 s6 = x[5 + idt]; 70 while (nz--) { 71 idx = 6 * (*vi++); 72 x1 = x[idx]; 73 x2 = x[1 + idx]; 74 x3 = x[2 + idx]; 75 x4 = x[3 + idx]; 76 x5 = x[4 + idx]; 77 x6 = x[5 + idx]; 78 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 79 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 80 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 81 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 82 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 83 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 84 v += 36; 85 } 86 v = aa + 36 * diag[i]; 87 x[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 88 x[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 89 x[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 90 x[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 91 x[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 92 x[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 93 } 94 95 PetscCall(VecRestoreArrayRead(bb, &b)); 96 PetscCall(VecRestoreArray(xx, &x)); 97 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 98 PetscFunctionReturn(PETSC_SUCCESS); 99 } 100 101 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A, Vec bb, Vec xx) 102 { 103 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 104 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 105 PetscInt i, k, nz, idx, jdx, idt; 106 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 107 const MatScalar *aa = a->a, *v; 108 PetscScalar *x; 109 const PetscScalar *b; 110 PetscScalar s1, s2, s3, s4, s5, s6, x1, x2, x3, x4, x5, x6; 111 112 PetscFunctionBegin; 113 PetscCall(VecGetArrayRead(bb, &b)); 114 PetscCall(VecGetArray(xx, &x)); 115 /* forward solve the lower triangular */ 116 idx = 0; 117 x[0] = b[idx]; 118 x[1] = b[1 + idx]; 119 x[2] = b[2 + idx]; 120 x[3] = b[3 + idx]; 121 x[4] = b[4 + idx]; 122 x[5] = b[5 + idx]; 123 for (i = 1; i < n; i++) { 124 v = aa + bs2 * ai[i]; 125 vi = aj + ai[i]; 126 nz = ai[i + 1] - ai[i]; 127 idx = bs * i; 128 s1 = b[idx]; 129 s2 = b[1 + idx]; 130 s3 = b[2 + idx]; 131 s4 = b[3 + idx]; 132 s5 = b[4 + idx]; 133 s6 = b[5 + idx]; 134 for (k = 0; k < nz; k++) { 135 jdx = bs * vi[k]; 136 x1 = x[jdx]; 137 x2 = x[1 + jdx]; 138 x3 = x[2 + jdx]; 139 x4 = x[3 + jdx]; 140 x5 = x[4 + jdx]; 141 x6 = x[5 + jdx]; 142 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 143 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 144 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 145 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 146 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 147 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 148 v += bs2; 149 } 150 151 x[idx] = s1; 152 x[1 + idx] = s2; 153 x[2 + idx] = s3; 154 x[3 + idx] = s4; 155 x[4 + idx] = s5; 156 x[5 + idx] = s6; 157 } 158 159 /* backward solve the upper triangular */ 160 for (i = n - 1; i >= 0; i--) { 161 v = aa + bs2 * (adiag[i + 1] + 1); 162 vi = aj + adiag[i + 1] + 1; 163 nz = adiag[i] - adiag[i + 1] - 1; 164 idt = bs * i; 165 s1 = x[idt]; 166 s2 = x[1 + idt]; 167 s3 = x[2 + idt]; 168 s4 = x[3 + idt]; 169 s5 = x[4 + idt]; 170 s6 = x[5 + idt]; 171 for (k = 0; k < nz; k++) { 172 idx = bs * vi[k]; 173 x1 = x[idx]; 174 x2 = x[1 + idx]; 175 x3 = x[2 + idx]; 176 x4 = x[3 + idx]; 177 x5 = x[4 + idx]; 178 x6 = x[5 + idx]; 179 s1 -= v[0] * x1 + v[6] * x2 + v[12] * x3 + v[18] * x4 + v[24] * x5 + v[30] * x6; 180 s2 -= v[1] * x1 + v[7] * x2 + v[13] * x3 + v[19] * x4 + v[25] * x5 + v[31] * x6; 181 s3 -= v[2] * x1 + v[8] * x2 + v[14] * x3 + v[20] * x4 + v[26] * x5 + v[32] * x6; 182 s4 -= v[3] * x1 + v[9] * x2 + v[15] * x3 + v[21] * x4 + v[27] * x5 + v[33] * x6; 183 s5 -= v[4] * x1 + v[10] * x2 + v[16] * x3 + v[22] * x4 + v[28] * x5 + v[34] * x6; 184 s6 -= v[5] * x1 + v[11] * x2 + v[17] * x3 + v[23] * x4 + v[29] * x5 + v[35] * x6; 185 v += bs2; 186 } 187 /* x = inv_diagonal*x */ 188 x[idt] = v[0] * s1 + v[6] * s2 + v[12] * s3 + v[18] * s4 + v[24] * s5 + v[30] * s6; 189 x[1 + idt] = v[1] * s1 + v[7] * s2 + v[13] * s3 + v[19] * s4 + v[25] * s5 + v[31] * s6; 190 x[2 + idt] = v[2] * s1 + v[8] * s2 + v[14] * s3 + v[20] * s4 + v[26] * s5 + v[32] * s6; 191 x[3 + idt] = v[3] * s1 + v[9] * s2 + v[15] * s3 + v[21] * s4 + v[27] * s5 + v[33] * s6; 192 x[4 + idt] = v[4] * s1 + v[10] * s2 + v[16] * s3 + v[22] * s4 + v[28] * s5 + v[34] * s6; 193 x[5 + idt] = v[5] * s1 + v[11] * s2 + v[17] * s3 + v[23] * s4 + v[29] * s5 + v[35] * s6; 194 } 195 196 PetscCall(VecRestoreArrayRead(bb, &b)); 197 PetscCall(VecRestoreArray(xx, &x)); 198 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 199 PetscFunctionReturn(PETSC_SUCCESS); 200 } 201