1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 5 { 6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 7 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 8 PetscInt i, nz, idx, idt, jdx; 9 const MatScalar *aa = a->a, *v; 10 PetscScalar *x, s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7; 11 const PetscScalar *b; 12 13 PetscFunctionBegin; 14 PetscCall(VecGetArrayRead(bb, &b)); 15 PetscCall(VecGetArray(xx, &x)); 16 /* forward solve the lower triangular */ 17 idx = 0; 18 x[0] = b[idx]; 19 x[1] = b[1 + idx]; 20 x[2] = b[2 + idx]; 21 x[3] = b[3 + idx]; 22 x[4] = b[4 + idx]; 23 x[5] = b[5 + idx]; 24 x[6] = b[6 + idx]; 25 for (i = 1; i < n; i++) { 26 v = aa + 49 * ai[i]; 27 vi = aj + ai[i]; 28 nz = diag[i] - ai[i]; 29 idx = 7 * i; 30 s1 = b[idx]; 31 s2 = b[1 + idx]; 32 s3 = b[2 + idx]; 33 s4 = b[3 + idx]; 34 s5 = b[4 + idx]; 35 s6 = b[5 + idx]; 36 s7 = b[6 + idx]; 37 while (nz--) { 38 jdx = 7 * (*vi++); 39 x1 = x[jdx]; 40 x2 = x[1 + jdx]; 41 x3 = x[2 + jdx]; 42 x4 = x[3 + jdx]; 43 x5 = x[4 + jdx]; 44 x6 = x[5 + jdx]; 45 x7 = x[6 + jdx]; 46 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 47 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 48 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 49 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 50 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 51 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 52 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 53 v += 49; 54 } 55 x[idx] = s1; 56 x[1 + idx] = s2; 57 x[2 + idx] = s3; 58 x[3 + idx] = s4; 59 x[4 + idx] = s5; 60 x[5 + idx] = s6; 61 x[6 + idx] = s7; 62 } 63 /* backward solve the upper triangular */ 64 for (i = n - 1; i >= 0; i--) { 65 v = aa + 49 * diag[i] + 49; 66 vi = aj + diag[i] + 1; 67 nz = ai[i + 1] - diag[i] - 1; 68 idt = 7 * i; 69 s1 = x[idt]; 70 s2 = x[1 + idt]; 71 s3 = x[2 + idt]; 72 s4 = x[3 + idt]; 73 s5 = x[4 + idt]; 74 s6 = x[5 + idt]; 75 s7 = x[6 + idt]; 76 while (nz--) { 77 idx = 7 * (*vi++); 78 x1 = x[idx]; 79 x2 = x[1 + idx]; 80 x3 = x[2 + idx]; 81 x4 = x[3 + idx]; 82 x5 = x[4 + idx]; 83 x6 = x[5 + idx]; 84 x7 = x[6 + idx]; 85 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 86 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 87 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 88 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 89 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 90 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 91 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 92 v += 49; 93 } 94 v = aa + 49 * diag[i]; 95 x[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 96 x[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 97 x[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 98 x[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 99 x[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 100 x[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 101 x[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 102 } 103 104 PetscCall(VecRestoreArrayRead(bb, &b)); 105 PetscCall(VecRestoreArray(xx, &x)); 106 PetscCall(PetscLogFlops(2.0 * 36 * (a->nz) - 6.0 * A->cmap->n)); 107 PetscFunctionReturn(0); 108 } 109 110 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A, Vec bb, Vec xx) 111 { 112 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 113 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *adiag = a->diag; 114 PetscInt i, k, nz, idx, jdx, idt; 115 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 116 const MatScalar *aa = a->a, *v; 117 PetscScalar *x; 118 const PetscScalar *b; 119 PetscScalar s1, s2, s3, s4, s5, s6, s7, x1, x2, x3, x4, x5, x6, x7; 120 121 PetscFunctionBegin; 122 PetscCall(VecGetArrayRead(bb, &b)); 123 PetscCall(VecGetArray(xx, &x)); 124 /* forward solve the lower triangular */ 125 idx = 0; 126 x[0] = b[idx]; 127 x[1] = b[1 + idx]; 128 x[2] = b[2 + idx]; 129 x[3] = b[3 + idx]; 130 x[4] = b[4 + idx]; 131 x[5] = b[5 + idx]; 132 x[6] = b[6 + idx]; 133 for (i = 1; i < n; i++) { 134 v = aa + bs2 * ai[i]; 135 vi = aj + ai[i]; 136 nz = ai[i + 1] - ai[i]; 137 idx = bs * i; 138 s1 = b[idx]; 139 s2 = b[1 + idx]; 140 s3 = b[2 + idx]; 141 s4 = b[3 + idx]; 142 s5 = b[4 + idx]; 143 s6 = b[5 + idx]; 144 s7 = b[6 + idx]; 145 for (k = 0; k < nz; k++) { 146 jdx = bs * vi[k]; 147 x1 = x[jdx]; 148 x2 = x[1 + jdx]; 149 x3 = x[2 + jdx]; 150 x4 = x[3 + jdx]; 151 x5 = x[4 + jdx]; 152 x6 = x[5 + jdx]; 153 x7 = x[6 + jdx]; 154 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 155 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 156 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 157 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 158 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 159 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 160 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 161 v += bs2; 162 } 163 164 x[idx] = s1; 165 x[1 + idx] = s2; 166 x[2 + idx] = s3; 167 x[3 + idx] = s4; 168 x[4 + idx] = s5; 169 x[5 + idx] = s6; 170 x[6 + idx] = s7; 171 } 172 173 /* backward solve the upper triangular */ 174 for (i = n - 1; i >= 0; i--) { 175 v = aa + bs2 * (adiag[i + 1] + 1); 176 vi = aj + adiag[i + 1] + 1; 177 nz = adiag[i] - adiag[i + 1] - 1; 178 idt = bs * i; 179 s1 = x[idt]; 180 s2 = x[1 + idt]; 181 s3 = x[2 + idt]; 182 s4 = x[3 + idt]; 183 s5 = x[4 + idt]; 184 s6 = x[5 + idt]; 185 s7 = x[6 + idt]; 186 for (k = 0; k < nz; k++) { 187 idx = bs * vi[k]; 188 x1 = x[idx]; 189 x2 = x[1 + idx]; 190 x3 = x[2 + idx]; 191 x4 = x[3 + idx]; 192 x5 = x[4 + idx]; 193 x6 = x[5 + idx]; 194 x7 = x[6 + idx]; 195 s1 -= v[0] * x1 + v[7] * x2 + v[14] * x3 + v[21] * x4 + v[28] * x5 + v[35] * x6 + v[42] * x7; 196 s2 -= v[1] * x1 + v[8] * x2 + v[15] * x3 + v[22] * x4 + v[29] * x5 + v[36] * x6 + v[43] * x7; 197 s3 -= v[2] * x1 + v[9] * x2 + v[16] * x3 + v[23] * x4 + v[30] * x5 + v[37] * x6 + v[44] * x7; 198 s4 -= v[3] * x1 + v[10] * x2 + v[17] * x3 + v[24] * x4 + v[31] * x5 + v[38] * x6 + v[45] * x7; 199 s5 -= v[4] * x1 + v[11] * x2 + v[18] * x3 + v[25] * x4 + v[32] * x5 + v[39] * x6 + v[46] * x7; 200 s6 -= v[5] * x1 + v[12] * x2 + v[19] * x3 + v[26] * x4 + v[33] * x5 + v[40] * x6 + v[47] * x7; 201 s7 -= v[6] * x1 + v[13] * x2 + v[20] * x3 + v[27] * x4 + v[34] * x5 + v[41] * x6 + v[48] * x7; 202 v += bs2; 203 } 204 /* x = inv_diagonal*x */ 205 x[idt] = v[0] * s1 + v[7] * s2 + v[14] * s3 + v[21] * s4 + v[28] * s5 + v[35] * s6 + v[42] * s7; 206 x[1 + idt] = v[1] * s1 + v[8] * s2 + v[15] * s3 + v[22] * s4 + v[29] * s5 + v[36] * s6 + v[43] * s7; 207 x[2 + idt] = v[2] * s1 + v[9] * s2 + v[16] * s3 + v[23] * s4 + v[30] * s5 + v[37] * s6 + v[44] * s7; 208 x[3 + idt] = v[3] * s1 + v[10] * s2 + v[17] * s3 + v[24] * s4 + v[31] * s5 + v[38] * s6 + v[45] * s7; 209 x[4 + idt] = v[4] * s1 + v[11] * s2 + v[18] * s3 + v[25] * s4 + v[32] * s5 + v[39] * s6 + v[46] * s7; 210 x[5 + idt] = v[5] * s1 + v[12] * s2 + v[19] * s3 + v[26] * s4 + v[33] * s5 + v[40] * s6 + v[47] * s7; 211 x[6 + idt] = v[6] * s1 + v[13] * s2 + v[20] * s3 + v[27] * s4 + v[34] * s5 + v[41] * s6 + v[48] * s7; 212 } 213 214 PetscCall(VecRestoreArrayRead(bb, &b)); 215 PetscCall(VecRestoreArray(xx, &x)); 216 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 217 PetscFunctionReturn(0); 218 } 219