1 #include <../src/mat/impls/baij/seq/baij.h> 2 #include <petsc/private/kernels/blockinvert.h> 3 4 /* Block operations are done by accessing one column at at time */ 5 /* Default MatSolve for block size 14 */ 6 7 PetscErrorCode MatSolve_SeqBAIJ_14_NaturalOrdering(Mat A,Vec bb,Vec xx) 8 { 9 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 10 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 11 PetscInt i,k,nz,idx,idt,m; 12 const MatScalar *aa=a->a,*v; 13 PetscScalar s[14]; 14 PetscScalar *x,xv; 15 const PetscScalar *b; 16 17 PetscFunctionBegin; 18 PetscCall(VecGetArrayRead(bb,&b)); 19 PetscCall(VecGetArray(xx,&x)); 20 21 /* forward solve the lower triangular */ 22 for (i=0; i<n; i++) { 23 v = aa + bs2*ai[i]; 24 vi = aj + ai[i]; 25 nz = ai[i+1] - ai[i]; 26 idt = bs*i; 27 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 28 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 29 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; 30 for (m=0; m<nz; m++) { 31 idx = bs*vi[m]; 32 for (k=0; k<bs; k++) { 33 xv = x[k + idx]; 34 x[idt] -= v[0]*xv; 35 x[1+idt] -= v[1]*xv; 36 x[2+idt] -= v[2]*xv; 37 x[3+idt] -= v[3]*xv; 38 x[4+idt] -= v[4]*xv; 39 x[5+idt] -= v[5]*xv; 40 x[6+idt] -= v[6]*xv; 41 x[7+idt] -= v[7]*xv; 42 x[8+idt] -= v[8]*xv; 43 x[9+idt] -= v[9]*xv; 44 x[10+idt] -= v[10]*xv; 45 x[11+idt] -= v[11]*xv; 46 x[12+idt] -= v[12]*xv; 47 x[13+idt] -= v[13]*xv; 48 v += 14; 49 } 50 } 51 } 52 /* backward solve the upper triangular */ 53 for (i=n-1; i>=0; i--) { 54 v = aa + bs2*(adiag[i+1]+1); 55 vi = aj + adiag[i+1]+1; 56 nz = adiag[i] - adiag[i+1] - 1; 57 idt = bs*i; 58 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 59 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 60 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; 61 62 for (m=0; m<nz; m++) { 63 idx = bs*vi[m]; 64 for (k=0; k<bs; k++) { 65 xv = x[k + idx]; 66 s[0] -= v[0]*xv; 67 s[1] -= v[1]*xv; 68 s[2] -= v[2]*xv; 69 s[3] -= v[3]*xv; 70 s[4] -= v[4]*xv; 71 s[5] -= v[5]*xv; 72 s[6] -= v[6]*xv; 73 s[7] -= v[7]*xv; 74 s[8] -= v[8]*xv; 75 s[9] -= v[9]*xv; 76 s[10] -= v[10]*xv; 77 s[11] -= v[11]*xv; 78 s[12] -= v[12]*xv; 79 s[13] -= v[13]*xv; 80 v += 14; 81 } 82 } 83 PetscCall(PetscArrayzero(x+idt,bs)); 84 for (k=0; k<bs; k++) { 85 x[idt] += v[0]*s[k]; 86 x[1+idt] += v[1]*s[k]; 87 x[2+idt] += v[2]*s[k]; 88 x[3+idt] += v[3]*s[k]; 89 x[4+idt] += v[4]*s[k]; 90 x[5+idt] += v[5]*s[k]; 91 x[6+idt] += v[6]*s[k]; 92 x[7+idt] += v[7]*s[k]; 93 x[8+idt] += v[8]*s[k]; 94 x[9+idt] += v[9]*s[k]; 95 x[10+idt] += v[10]*s[k]; 96 x[11+idt] += v[11]*s[k]; 97 x[12+idt] += v[12]*s[k]; 98 x[13+idt] += v[13]*s[k]; 99 v += 14; 100 } 101 } 102 PetscCall(VecRestoreArrayRead(bb,&b)); 103 PetscCall(VecRestoreArray(xx,&x)); 104 PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 105 PetscFunctionReturn(0); 106 } 107 108 /* Block operations are done by accessing one column at at time */ 109 /* Default MatSolve for block size 13 */ 110 111 PetscErrorCode MatSolve_SeqBAIJ_13_NaturalOrdering(Mat A,Vec bb,Vec xx) 112 { 113 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 114 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 115 PetscInt i,k,nz,idx,idt,m; 116 const MatScalar *aa=a->a,*v; 117 PetscScalar s[13]; 118 PetscScalar *x,xv; 119 const PetscScalar *b; 120 121 PetscFunctionBegin; 122 PetscCall(VecGetArrayRead(bb,&b)); 123 PetscCall(VecGetArray(xx,&x)); 124 125 /* forward solve the lower triangular */ 126 for (i=0; i<n; i++) { 127 v = aa + bs2*ai[i]; 128 vi = aj + ai[i]; 129 nz = ai[i+1] - ai[i]; 130 idt = bs*i; 131 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 132 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 133 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; 134 for (m=0; m<nz; m++) { 135 idx = bs*vi[m]; 136 for (k=0; k<bs; k++) { 137 xv = x[k + idx]; 138 x[idt] -= v[0]*xv; 139 x[1+idt] -= v[1]*xv; 140 x[2+idt] -= v[2]*xv; 141 x[3+idt] -= v[3]*xv; 142 x[4+idt] -= v[4]*xv; 143 x[5+idt] -= v[5]*xv; 144 x[6+idt] -= v[6]*xv; 145 x[7+idt] -= v[7]*xv; 146 x[8+idt] -= v[8]*xv; 147 x[9+idt] -= v[9]*xv; 148 x[10+idt] -= v[10]*xv; 149 x[11+idt] -= v[11]*xv; 150 x[12+idt] -= v[12]*xv; 151 v += 13; 152 } 153 } 154 } 155 /* backward solve the upper triangular */ 156 for (i=n-1; i>=0; i--) { 157 v = aa + bs2*(adiag[i+1]+1); 158 vi = aj + adiag[i+1]+1; 159 nz = adiag[i] - adiag[i+1] - 1; 160 idt = bs*i; 161 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 162 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 163 s[10] = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; 164 165 for (m=0; m<nz; m++) { 166 idx = bs*vi[m]; 167 for (k=0; k<bs; k++) { 168 xv = x[k + idx]; 169 s[0] -= v[0]*xv; 170 s[1] -= v[1]*xv; 171 s[2] -= v[2]*xv; 172 s[3] -= v[3]*xv; 173 s[4] -= v[4]*xv; 174 s[5] -= v[5]*xv; 175 s[6] -= v[6]*xv; 176 s[7] -= v[7]*xv; 177 s[8] -= v[8]*xv; 178 s[9] -= v[9]*xv; 179 s[10] -= v[10]*xv; 180 s[11] -= v[11]*xv; 181 s[12] -= v[12]*xv; 182 v += 13; 183 } 184 } 185 PetscCall(PetscArrayzero(x+idt,bs)); 186 for (k=0; k<bs; k++) { 187 x[idt] += v[0]*s[k]; 188 x[1+idt] += v[1]*s[k]; 189 x[2+idt] += v[2]*s[k]; 190 x[3+idt] += v[3]*s[k]; 191 x[4+idt] += v[4]*s[k]; 192 x[5+idt] += v[5]*s[k]; 193 x[6+idt] += v[6]*s[k]; 194 x[7+idt] += v[7]*s[k]; 195 x[8+idt] += v[8]*s[k]; 196 x[9+idt] += v[9]*s[k]; 197 x[10+idt] += v[10]*s[k]; 198 x[11+idt] += v[11]*s[k]; 199 x[12+idt] += v[12]*s[k]; 200 v += 13; 201 } 202 } 203 PetscCall(VecRestoreArrayRead(bb,&b)); 204 PetscCall(VecRestoreArray(xx,&x)); 205 PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 206 PetscFunctionReturn(0); 207 } 208 209 /* Block operations are done by accessing one column at at time */ 210 /* Default MatSolve for block size 12 */ 211 212 PetscErrorCode MatSolve_SeqBAIJ_12_NaturalOrdering(Mat A,Vec bb,Vec xx) 213 { 214 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data; 215 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 216 PetscInt i,k,nz,idx,idt,m; 217 const MatScalar *aa=a->a,*v; 218 PetscScalar s[12]; 219 PetscScalar *x,xv; 220 const PetscScalar *b; 221 222 PetscFunctionBegin; 223 PetscCall(VecGetArrayRead(bb,&b)); 224 PetscCall(VecGetArray(xx,&x)); 225 226 /* forward solve the lower triangular */ 227 for (i=0; i<n; i++) { 228 v = aa + bs2*ai[i]; 229 vi = aj + ai[i]; 230 nz = ai[i+1] - ai[i]; 231 idt = bs*i; 232 x[idt] = b[idt]; x[1+idt] = b[1+idt]; x[2+idt] = b[2+idt]; x[3+idt] = b[3+idt]; x[4+idt] = b[4+idt]; 233 x[5+idt] = b[5+idt]; x[6+idt] = b[6+idt]; x[7+idt] = b[7+idt]; x[8+idt] = b[8+idt]; x[9+idt] = b[9+idt]; 234 x[10+idt] = b[10+idt]; x[11+idt] = b[11+idt]; 235 for (m=0; m<nz; m++) { 236 idx = bs*vi[m]; 237 for (k=0; k<bs; k++) { 238 xv = x[k + idx]; 239 x[idt] -= v[0]*xv; 240 x[1+idt] -= v[1]*xv; 241 x[2+idt] -= v[2]*xv; 242 x[3+idt] -= v[3]*xv; 243 x[4+idt] -= v[4]*xv; 244 x[5+idt] -= v[5]*xv; 245 x[6+idt] -= v[6]*xv; 246 x[7+idt] -= v[7]*xv; 247 x[8+idt] -= v[8]*xv; 248 x[9+idt] -= v[9]*xv; 249 x[10+idt] -= v[10]*xv; 250 x[11+idt] -= v[11]*xv; 251 v += 12; 252 } 253 } 254 } 255 /* backward solve the upper triangular */ 256 for (i=n-1; i>=0; i--) { 257 v = aa + bs2*(adiag[i+1]+1); 258 vi = aj + adiag[i+1]+1; 259 nz = adiag[i] - adiag[i+1] - 1; 260 idt = bs*i; 261 s[0] = x[idt]; s[1] = x[1+idt]; s[2] = x[2+idt]; s[3] = x[3+idt]; s[4] = x[4+idt]; 262 s[5] = x[5+idt]; s[6] = x[6+idt]; s[7] = x[7+idt]; s[8] = x[8+idt]; s[9] = x[9+idt]; 263 s[10] = x[10+idt]; s[11] = x[11+idt]; 264 265 for (m=0; m<nz; m++) { 266 idx = bs*vi[m]; 267 for (k=0; k<bs; k++) { 268 xv = x[k + idx]; 269 s[0] -= v[0]*xv; 270 s[1] -= v[1]*xv; 271 s[2] -= v[2]*xv; 272 s[3] -= v[3]*xv; 273 s[4] -= v[4]*xv; 274 s[5] -= v[5]*xv; 275 s[6] -= v[6]*xv; 276 s[7] -= v[7]*xv; 277 s[8] -= v[8]*xv; 278 s[9] -= v[9]*xv; 279 s[10] -= v[10]*xv; 280 s[11] -= v[11]*xv; 281 v += 12; 282 } 283 } 284 PetscCall(PetscArrayzero(x+idt,bs)); 285 for (k=0; k<bs; k++) { 286 x[idt] += v[0]*s[k]; 287 x[1+idt] += v[1]*s[k]; 288 x[2+idt] += v[2]*s[k]; 289 x[3+idt] += v[3]*s[k]; 290 x[4+idt] += v[4]*s[k]; 291 x[5+idt] += v[5]*s[k]; 292 x[6+idt] += v[6]*s[k]; 293 x[7+idt] += v[7]*s[k]; 294 x[8+idt] += v[8]*s[k]; 295 x[9+idt] += v[9]*s[k]; 296 x[10+idt] += v[10]*s[k]; 297 x[11+idt] += v[11]*s[k]; 298 v += 12; 299 } 300 } 301 PetscCall(VecRestoreArrayRead(bb,&b)); 302 PetscCall(VecRestoreArray(xx,&x)); 303 PetscCall(PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n)); 304 PetscFunctionReturn(0); 305 } 306