1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* Block operations are done by accessing one column at at time */ 5 /* Default MatSolve for block size 14 */ 6 7 PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A, Vec bb, Vec xx) { 8 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 9 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2; 10 PetscInt i, k, nz, idx, idt, m; 11 const MatScalar *aa = a->a, *v; 12 PetscScalar s[14]; 13 PetscScalar *x, xv; 14 const PetscScalar *b; 15 16 PetscFunctionBegin; 17 PetscCall(VecGetArrayRead(bb, &b)); 18 PetscCall(VecGetArray(xx, &x)); 19 20 /* forward solve the lower triangular */ 21 for (i = 0; i < n; i++) { 22 v = aa + bs2 * ai[i]; 23 vi = aj + ai[i]; 24 nz = ai[i + 1] - ai[i]; 25 idt = bs * i; 26 x[idt] = b[idt]; 27 x[1 + idt] = b[1 + idt]; 28 x[2 + idt] = b[2 + idt]; 29 x[3 + idt] = b[3 + idt]; 30 x[4 + idt] = b[4 + idt]; 31 x[5 + idt] = b[5 + idt]; 32 x[6 + idt] = b[6 + idt]; 33 x[7 + idt] = b[7 + idt]; 34 x[8 + idt] = b[8 + idt]; 35 x[9 + idt] = b[9 + idt]; 36 x[10 + idt] = b[10 + idt]; 37 x[11 + idt] = b[11 + idt]; 38 x[12 + idt] = b[12 + idt]; 39 x[13 + idt] = b[13 + idt]; 40 for (m = 0; m < nz; m++) { 41 idx = bs * vi[m]; 42 for (k = 0; k < bs; k++) { 43 xv = x[k + idx]; 44 x[idt] -= v[0] * xv; 45 x[1 + idt] -= v[1] * xv; 46 x[2 + idt] -= v[2] * xv; 47 x[3 + idt] -= v[3] * xv; 48 x[4 + idt] -= v[4] * xv; 49 x[5 + idt] -= v[5] * xv; 50 x[6 + idt] -= v[6] * xv; 51 x[7 + idt] -= v[7] * xv; 52 x[8 + idt] -= v[8] * xv; 53 x[9 + idt] -= v[9] * xv; 54 x[10 + idt] -= v[10] * xv; 55 x[11 + idt] -= v[11] * xv; 56 x[12 + idt] -= v[12] * xv; 57 x[13 + idt] -= v[13] * xv; 58 v += 14; 59 } 60 } 61 } 62 /* backward solve the upper triangular */ 63 for (i = n - 1; i >= 0; i--) { 64 v = aa + bs2 * (adiag[i + 1] + 1); 65 vi = aj + adiag[i + 1] + 1; 66 nz = adiag[i] - adiag[i + 1] - 1; 67 idt = bs * i; 68 s[0] = x[idt]; 69 s[1] = x[1 + idt]; 70 s[2] = x[2 + idt]; 71 s[3] = x[3 + idt]; 72 s[4] = x[4 + idt]; 73 s[5] = x[5 + idt]; 74 s[6] = x[6 + idt]; 75 s[7] = x[7 + idt]; 76 s[8] = x[8 + idt]; 77 s[9] = x[9 + idt]; 78 s[10] = x[10 + idt]; 79 s[11] = x[11 + idt]; 80 s[12] = x[12 + idt]; 81 s[13] = x[13 + idt]; 82 83 for (m = 0; m < nz; m++) { 84 idx = bs * vi[m]; 85 for (k = 0; k < bs; k++) { 86 xv = x[k + idx]; 87 s[0] -= v[0] * xv; 88 s[1] -= v[1] * xv; 89 s[2] -= v[2] * xv; 90 s[3] -= v[3] * xv; 91 s[4] -= v[4] * xv; 92 s[5] -= v[5] * xv; 93 s[6] -= v[6] * xv; 94 s[7] -= v[7] * xv; 95 s[8] -= v[8] * xv; 96 s[9] -= v[9] * xv; 97 s[10] -= v[10] * xv; 98 s[11] -= v[11] * xv; 99 s[12] -= v[12] * xv; 100 s[13] -= v[13] * xv; 101 v += 14; 102 } 103 } 104 PetscCall(PetscArrayzero(x + idt, bs)); 105 for (k = 0; k < bs; k++) { 106 x[idt] += v[0] * s[k]; 107 x[1 + idt] += v[1] * s[k]; 108 x[2 + idt] += v[2] * s[k]; 109 x[3 + idt] += v[3] * s[k]; 110 x[4 + idt] += v[4] * s[k]; 111 x[5 + idt] += v[5] * s[k]; 112 x[6 + idt] += v[6] * s[k]; 113 x[7 + idt] += v[7] * s[k]; 114 x[8 + idt] += v[8] * s[k]; 115 x[9 + idt] += v[9] * s[k]; 116 x[10 + idt] += v[10] * s[k]; 117 x[11 + idt] += v[11] * s[k]; 118 x[12 + idt] += v[12] * s[k]; 119 x[13 + idt] += v[13] * s[k]; 120 v += 14; 121 } 122 } 123 PetscCall(VecRestoreArrayRead(bb, &b)); 124 PetscCall(VecRestoreArray(xx, &x)); 125 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 126 PetscFunctionReturn(0); 127 } 128 129 /* Block operations are done by accessing one column at at time */ 130 /* Default MatSolve for block size 13 */ 131 132 PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A, Vec bb, Vec xx) { 133 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 134 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2; 135 PetscInt i, k, nz, idx, idt, m; 136 const MatScalar *aa = a->a, *v; 137 PetscScalar s[13]; 138 PetscScalar *x, xv; 139 const PetscScalar *b; 140 141 PetscFunctionBegin; 142 PetscCall(VecGetArrayRead(bb, &b)); 143 PetscCall(VecGetArray(xx, &x)); 144 145 /* forward solve the lower triangular */ 146 for (i = 0; i < n; i++) { 147 v = aa + bs2 * ai[i]; 148 vi = aj + ai[i]; 149 nz = ai[i + 1] - ai[i]; 150 idt = bs * i; 151 x[idt] = b[idt]; 152 x[1 + idt] = b[1 + idt]; 153 x[2 + idt] = b[2 + idt]; 154 x[3 + idt] = b[3 + idt]; 155 x[4 + idt] = b[4 + idt]; 156 x[5 + idt] = b[5 + idt]; 157 x[6 + idt] = b[6 + idt]; 158 x[7 + idt] = b[7 + idt]; 159 x[8 + idt] = b[8 + idt]; 160 x[9 + idt] = b[9 + idt]; 161 x[10 + idt] = b[10 + idt]; 162 x[11 + idt] = b[11 + idt]; 163 x[12 + idt] = b[12 + idt]; 164 for (m = 0; m < nz; m++) { 165 idx = bs * vi[m]; 166 for (k = 0; k < bs; k++) { 167 xv = x[k + idx]; 168 x[idt] -= v[0] * xv; 169 x[1 + idt] -= v[1] * xv; 170 x[2 + idt] -= v[2] * xv; 171 x[3 + idt] -= v[3] * xv; 172 x[4 + idt] -= v[4] * xv; 173 x[5 + idt] -= v[5] * xv; 174 x[6 + idt] -= v[6] * xv; 175 x[7 + idt] -= v[7] * xv; 176 x[8 + idt] -= v[8] * xv; 177 x[9 + idt] -= v[9] * xv; 178 x[10 + idt] -= v[10] * xv; 179 x[11 + idt] -= v[11] * xv; 180 x[12 + idt] -= v[12] * xv; 181 v += 13; 182 } 183 } 184 } 185 /* backward solve the upper triangular */ 186 for (i = n - 1; i >= 0; i--) { 187 v = aa + bs2 * (adiag[i + 1] + 1); 188 vi = aj + adiag[i + 1] + 1; 189 nz = adiag[i] - adiag[i + 1] - 1; 190 idt = bs * i; 191 s[0] = x[idt]; 192 s[1] = x[1 + idt]; 193 s[2] = x[2 + idt]; 194 s[3] = x[3 + idt]; 195 s[4] = x[4 + idt]; 196 s[5] = x[5 + idt]; 197 s[6] = x[6 + idt]; 198 s[7] = x[7 + idt]; 199 s[8] = x[8 + idt]; 200 s[9] = x[9 + idt]; 201 s[10] = x[10 + idt]; 202 s[11] = x[11 + idt]; 203 s[12] = x[12 + idt]; 204 205 for (m = 0; m < nz; m++) { 206 idx = bs * vi[m]; 207 for (k = 0; k < bs; k++) { 208 xv = x[k + idx]; 209 s[0] -= v[0] * xv; 210 s[1] -= v[1] * xv; 211 s[2] -= v[2] * xv; 212 s[3] -= v[3] * xv; 213 s[4] -= v[4] * xv; 214 s[5] -= v[5] * xv; 215 s[6] -= v[6] * xv; 216 s[7] -= v[7] * xv; 217 s[8] -= v[8] * xv; 218 s[9] -= v[9] * xv; 219 s[10] -= v[10] * xv; 220 s[11] -= v[11] * xv; 221 s[12] -= v[12] * xv; 222 v += 13; 223 } 224 } 225 PetscCall(PetscArrayzero(x + idt, bs)); 226 for (k = 0; k < bs; k++) { 227 x[idt] += v[0] * s[k]; 228 x[1 + idt] += v[1] * s[k]; 229 x[2 + idt] += v[2] * s[k]; 230 x[3 + idt] += v[3] * s[k]; 231 x[4 + idt] += v[4] * s[k]; 232 x[5 + idt] += v[5] * s[k]; 233 x[6 + idt] += v[6] * s[k]; 234 x[7 + idt] += v[7] * s[k]; 235 x[8 + idt] += v[8] * s[k]; 236 x[9 + idt] += v[9] * s[k]; 237 x[10 + idt] += v[10] * s[k]; 238 x[11 + idt] += v[11] * s[k]; 239 x[12 + idt] += v[12] * s[k]; 240 v += 13; 241 } 242 } 243 PetscCall(VecRestoreArrayRead(bb, &b)); 244 PetscCall(VecRestoreArray(xx, &x)); 245 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 246 PetscFunctionReturn(0); 247 } 248 249 /* Block operations are done by accessing one column at at time */ 250 /* Default MatSolve for block size 12 */ 251 252 PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A, Vec bb, Vec xx) { 253 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 254 const PetscInt n = a->mbs, *ai = a->i, *aj = a->j, *adiag = a->diag, *vi, bs = A->rmap->bs, bs2 = a->bs2; 255 PetscInt i, k, nz, idx, idt, m; 256 const MatScalar *aa = a->a, *v; 257 PetscScalar s[12]; 258 PetscScalar *x, xv; 259 const PetscScalar *b; 260 261 PetscFunctionBegin; 262 PetscCall(VecGetArrayRead(bb, &b)); 263 PetscCall(VecGetArray(xx, &x)); 264 265 /* forward solve the lower triangular */ 266 for (i = 0; i < n; i++) { 267 v = aa + bs2 * ai[i]; 268 vi = aj + ai[i]; 269 nz = ai[i + 1] - ai[i]; 270 idt = bs * i; 271 x[idt] = b[idt]; 272 x[1 + idt] = b[1 + idt]; 273 x[2 + idt] = b[2 + idt]; 274 x[3 + idt] = b[3 + idt]; 275 x[4 + idt] = b[4 + idt]; 276 x[5 + idt] = b[5 + idt]; 277 x[6 + idt] = b[6 + idt]; 278 x[7 + idt] = b[7 + idt]; 279 x[8 + idt] = b[8 + idt]; 280 x[9 + idt] = b[9 + idt]; 281 x[10 + idt] = b[10 + idt]; 282 x[11 + idt] = b[11 + idt]; 283 for (m = 0; m < nz; m++) { 284 idx = bs * vi[m]; 285 for (k = 0; k < bs; k++) { 286 xv = x[k + idx]; 287 x[idt] -= v[0] * xv; 288 x[1 + idt] -= v[1] * xv; 289 x[2 + idt] -= v[2] * xv; 290 x[3 + idt] -= v[3] * xv; 291 x[4 + idt] -= v[4] * xv; 292 x[5 + idt] -= v[5] * xv; 293 x[6 + idt] -= v[6] * xv; 294 x[7 + idt] -= v[7] * xv; 295 x[8 + idt] -= v[8] * xv; 296 x[9 + idt] -= v[9] * xv; 297 x[10 + idt] -= v[10] * xv; 298 x[11 + idt] -= v[11] * xv; 299 v += 12; 300 } 301 } 302 } 303 /* backward solve the upper triangular */ 304 for (i = n - 1; i >= 0; i--) { 305 v = aa + bs2 * (adiag[i + 1] + 1); 306 vi = aj + adiag[i + 1] + 1; 307 nz = adiag[i] - adiag[i + 1] - 1; 308 idt = bs * i; 309 s[0] = x[idt]; 310 s[1] = x[1 + idt]; 311 s[2] = x[2 + idt]; 312 s[3] = x[3 + idt]; 313 s[4] = x[4 + idt]; 314 s[5] = x[5 + idt]; 315 s[6] = x[6 + idt]; 316 s[7] = x[7 + idt]; 317 s[8] = x[8 + idt]; 318 s[9] = x[9 + idt]; 319 s[10] = x[10 + idt]; 320 s[11] = x[11 + idt]; 321 322 for (m = 0; m < nz; m++) { 323 idx = bs * vi[m]; 324 for (k = 0; k < bs; k++) { 325 xv = x[k + idx]; 326 s[0] -= v[0] * xv; 327 s[1] -= v[1] * xv; 328 s[2] -= v[2] * xv; 329 s[3] -= v[3] * xv; 330 s[4] -= v[4] * xv; 331 s[5] -= v[5] * xv; 332 s[6] -= v[6] * xv; 333 s[7] -= v[7] * xv; 334 s[8] -= v[8] * xv; 335 s[9] -= v[9] * xv; 336 s[10] -= v[10] * xv; 337 s[11] -= v[11] * xv; 338 v += 12; 339 } 340 } 341 PetscCall(PetscArrayzero(x + idt, bs)); 342 for (k = 0; k < bs; k++) { 343 x[idt] += v[0] * s[k]; 344 x[1 + idt] += v[1] * s[k]; 345 x[2 + idt] += v[2] * s[k]; 346 x[3 + idt] += v[3] * s[k]; 347 x[4 + idt] += v[4] * s[k]; 348 x[5 + idt] += v[5] * s[k]; 349 x[6 + idt] += v[6] * s[k]; 350 x[7 + idt] += v[7] * s[k]; 351 x[8 + idt] += v[8] * s[k]; 352 x[9 + idt] += v[9] * s[k]; 353 x[10 + idt] += v[10] * s[k]; 354 x[11 + idt] += v[11] * s[k]; 355 v += 12; 356 } 357 } 358 PetscCall(VecRestoreArrayRead(bb, &b)); 359 PetscCall(VecRestoreArray(xx, &x)); 360 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 361 PetscFunctionReturn(0); 362 } 363