1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5 { 6 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 7 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 8 PetscInt i,nz,idx,idt,jdx; 9 const MatScalar *aa=a->a,*v; 10 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 11 const PetscScalar *b; 12 13 PetscFunctionBegin; 14 PetscCall(VecGetArrayRead(bb,&b)); 15 PetscCall(VecGetArray(xx,&x)); 16 /* forward solve the lower triangular */ 17 idx = 0; 18 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 19 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 20 x[6] = b[6+idx]; 21 for (i=1; i<n; i++) { 22 v = aa + 49*ai[i]; 23 vi = aj + ai[i]; 24 nz = diag[i] - ai[i]; 25 idx = 7*i; 26 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 27 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 28 s7 = b[6+idx]; 29 while (nz--) { 30 jdx = 7*(*vi++); 31 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 32 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 33 x7 = x[6+jdx]; 34 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 35 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 36 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 37 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 38 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 39 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 40 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 41 v += 49; 42 } 43 x[idx] = s1; 44 x[1+idx] = s2; 45 x[2+idx] = s3; 46 x[3+idx] = s4; 47 x[4+idx] = s5; 48 x[5+idx] = s6; 49 x[6+idx] = s7; 50 } 51 /* backward solve the upper triangular */ 52 for (i=n-1; i>=0; i--) { 53 v = aa + 49*diag[i] + 49; 54 vi = aj + diag[i] + 1; 55 nz = ai[i+1] - diag[i] - 1; 56 idt = 7*i; 57 s1 = x[idt]; s2 = x[1+idt]; 58 s3 = x[2+idt]; s4 = x[3+idt]; 59 s5 = x[4+idt]; s6 = x[5+idt]; 60 s7 = x[6+idt]; 61 while (nz--) { 62 idx = 7*(*vi++); 63 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 64 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 65 x7 = x[6+idx]; 66 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 67 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 68 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 69 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 70 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 71 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 72 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 73 v += 49; 74 } 75 v = aa + 49*diag[i]; 76 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 77 + v[28]*s5 + v[35]*s6 + v[42]*s7; 78 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 79 + v[29]*s5 + v[36]*s6 + v[43]*s7; 80 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 81 + v[30]*s5 + v[37]*s6 + v[44]*s7; 82 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 83 + v[31]*s5 + v[38]*s6 + v[45]*s7; 84 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 85 + v[32]*s5 + v[39]*s6 + v[46]*s7; 86 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 87 + v[33]*s5 + v[40]*s6 + v[47]*s7; 88 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 89 + v[34]*s5 + v[41]*s6 + v[48]*s7; 90 } 91 92 PetscCall(VecRestoreArrayRead(bb,&b)); 93 PetscCall(VecRestoreArray(xx,&x)); 94 PetscCall(PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n)); 95 PetscFunctionReturn(0); 96 } 97 98 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 99 { 100 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data; 101 const PetscInt n =a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 102 PetscInt i,k,nz,idx,jdx,idt; 103 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 104 const MatScalar *aa=a->a,*v; 105 PetscScalar *x; 106 const PetscScalar *b; 107 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 108 109 PetscFunctionBegin; 110 PetscCall(VecGetArrayRead(bb,&b)); 111 PetscCall(VecGetArray(xx,&x)); 112 /* forward solve the lower triangular */ 113 idx = 0; 114 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 115 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 116 for (i=1; i<n; i++) { 117 v = aa + bs2*ai[i]; 118 vi = aj + ai[i]; 119 nz = ai[i+1] - ai[i]; 120 idx = bs*i; 121 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 122 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 123 for (k=0; k<nz; k++) { 124 jdx = bs*vi[k]; 125 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 126 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 127 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 128 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 129 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 130 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 131 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 132 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 133 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 134 v += bs2; 135 } 136 137 x[idx] = s1; 138 x[1+idx] = s2; 139 x[2+idx] = s3; 140 x[3+idx] = s4; 141 x[4+idx] = s5; 142 x[5+idx] = s6; 143 x[6+idx] = s7; 144 } 145 146 /* backward solve the upper triangular */ 147 for (i=n-1; i>=0; i--) { 148 v = aa + bs2*(adiag[i+1]+1); 149 vi = aj + adiag[i+1]+1; 150 nz = adiag[i] - adiag[i+1]-1; 151 idt = bs*i; 152 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 153 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 154 for (k=0; k<nz; k++) { 155 idx = bs*vi[k]; 156 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 157 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 158 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 159 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 160 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 161 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 162 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 163 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 164 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 165 v += bs2; 166 } 167 /* x = inv_diagonal*x */ 168 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 169 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 170 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 171 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 172 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 173 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 174 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 175 } 176 177 PetscCall(VecRestoreArrayRead(bb,&b)); 178 PetscCall(VecRestoreArray(xx,&x)); 179 PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 180 PetscFunctionReturn(0); 181 } 182