1 #include <../src/mat/impls/baij/seq/baij.h> 2 3 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A, Vec bb, Vec xx) 4 { 5 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6 const PetscInt *diag = a->diag, n = a->mbs, *vi, *ai = a->i, *aj = a->j; 7 PetscInt i, nz, idx, idt, oidx; 8 const MatScalar *aa = a->a, *v; 9 PetscScalar s1, s2, s3, s4, x1, x2, x3, x4, *x; 10 11 PetscFunctionBegin; 12 PetscCall(VecCopy(bb, xx)); 13 PetscCall(VecGetArray(xx, &x)); 14 15 /* forward solve the U^T */ 16 idx = 0; 17 for (i = 0; i < n; i++) { 18 v = aa + 16 * diag[i]; 19 /* multiply by the inverse of the block diagonal */ 20 x1 = x[idx]; 21 x2 = x[1 + idx]; 22 x3 = x[2 + idx]; 23 x4 = x[3 + idx]; 24 s1 = v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 25 s2 = v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 26 s3 = v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 27 s4 = v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 28 v += 16; 29 30 vi = aj + diag[i] + 1; 31 nz = ai[i + 1] - diag[i] - 1; 32 while (nz--) { 33 oidx = 4 * (*vi++); 34 x[oidx] -= v[0] * s1 + v[1] * s2 + v[2] * s3 + v[3] * s4; 35 x[oidx + 1] -= v[4] * s1 + v[5] * s2 + v[6] * s3 + v[7] * s4; 36 x[oidx + 2] -= v[8] * s1 + v[9] * s2 + v[10] * s3 + v[11] * s4; 37 x[oidx + 3] -= v[12] * s1 + v[13] * s2 + v[14] * s3 + v[15] * s4; 38 v += 16; 39 } 40 x[idx] = s1; 41 x[1 + idx] = s2; 42 x[2 + idx] = s3; 43 x[3 + idx] = s4; 44 idx += 4; 45 } 46 /* backward solve the L^T */ 47 for (i = n - 1; i >= 0; i--) { 48 v = aa + 16 * diag[i] - 16; 49 vi = aj + diag[i] - 1; 50 nz = diag[i] - ai[i]; 51 idt = 4 * i; 52 s1 = x[idt]; 53 s2 = x[1 + idt]; 54 s3 = x[2 + idt]; 55 s4 = x[3 + idt]; 56 while (nz--) { 57 idx = 4 * (*vi--); 58 x[idx] -= v[0] * s1 + v[1] * s2 + v[2] * s3 + v[3] * s4; 59 x[idx + 1] -= v[4] * s1 + v[5] * s2 + v[6] * s3 + v[7] * s4; 60 x[idx + 2] -= v[8] * s1 + v[9] * s2 + v[10] * s3 + v[11] * s4; 61 x[idx + 3] -= v[12] * s1 + v[13] * s2 + v[14] * s3 + v[15] * s4; 62 v -= 16; 63 } 64 } 65 PetscCall(VecRestoreArray(xx, &x)); 66 PetscCall(PetscLogFlops(2.0 * 16 * (a->nz) - 4.0 * A->cmap->n)); 67 PetscFunctionReturn(PETSC_SUCCESS); 68 } 69 70 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A, Vec bb, Vec xx) 71 { 72 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 73 const PetscInt n = a->mbs, *vi, *ai = a->i, *aj = a->j, *diag = a->diag; 74 PetscInt nz, idx, idt, j, i, oidx; 75 const PetscInt bs = A->rmap->bs, bs2 = a->bs2; 76 const MatScalar *aa = a->a, *v; 77 PetscScalar s1, s2, s3, s4, x1, x2, x3, x4, *x; 78 79 PetscFunctionBegin; 80 PetscCall(VecCopy(bb, xx)); 81 PetscCall(VecGetArray(xx, &x)); 82 83 /* forward solve the U^T */ 84 idx = 0; 85 for (i = 0; i < n; i++) { 86 v = aa + bs2 * diag[i]; 87 /* multiply by the inverse of the block diagonal */ 88 x1 = x[idx]; 89 x2 = x[1 + idx]; 90 x3 = x[2 + idx]; 91 x4 = x[3 + idx]; 92 s1 = v[0] * x1 + v[1] * x2 + v[2] * x3 + v[3] * x4; 93 s2 = v[4] * x1 + v[5] * x2 + v[6] * x3 + v[7] * x4; 94 s3 = v[8] * x1 + v[9] * x2 + v[10] * x3 + v[11] * x4; 95 s4 = v[12] * x1 + v[13] * x2 + v[14] * x3 + v[15] * x4; 96 v -= bs2; 97 98 vi = aj + diag[i] - 1; 99 nz = diag[i] - diag[i + 1] - 1; 100 for (j = 0; j > -nz; j--) { 101 oidx = bs * vi[j]; 102 x[oidx] -= v[0] * s1 + v[1] * s2 + v[2] * s3 + v[3] * s4; 103 x[oidx + 1] -= v[4] * s1 + v[5] * s2 + v[6] * s3 + v[7] * s4; 104 x[oidx + 2] -= v[8] * s1 + v[9] * s2 + v[10] * s3 + v[11] * s4; 105 x[oidx + 3] -= v[12] * s1 + v[13] * s2 + v[14] * s3 + v[15] * s4; 106 v -= bs2; 107 } 108 x[idx] = s1; 109 x[1 + idx] = s2; 110 x[2 + idx] = s3; 111 x[3 + idx] = s4; 112 idx += bs; 113 } 114 /* backward solve the L^T */ 115 for (i = n - 1; i >= 0; i--) { 116 v = aa + bs2 * ai[i]; 117 vi = aj + ai[i]; 118 nz = ai[i + 1] - ai[i]; 119 idt = bs * i; 120 s1 = x[idt]; 121 s2 = x[1 + idt]; 122 s3 = x[2 + idt]; 123 s4 = x[3 + idt]; 124 for (j = 0; j < nz; j++) { 125 idx = bs * vi[j]; 126 x[idx] -= v[0] * s1 + v[1] * s2 + v[2] * s3 + v[3] * s4; 127 x[idx + 1] -= v[4] * s1 + v[5] * s2 + v[6] * s3 + v[7] * s4; 128 x[idx + 2] -= v[8] * s1 + v[9] * s2 + v[10] * s3 + v[11] * s4; 129 x[idx + 3] -= v[12] * s1 + v[13] * s2 + v[14] * s3 + v[15] * s4; 130 v += bs2; 131 } 132 } 133 PetscCall(VecRestoreArray(xx, &x)); 134 PetscCall(PetscLogFlops(2.0 * bs2 * (a->nz) - bs * A->cmap->n)); 135 PetscFunctionReturn(PETSC_SUCCESS); 136 } 137