1 /* 2 Factorization code for BAIJ format. 3 */ 4 5 #include "src/mat/impls/baij/seq/baij.h" 6 #include "src/inline/ilu.h" 7 #include "src/inline/dot.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 11 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 12 { 13 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14 PetscErrorCode ierr; 15 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 16 int *diag = a->diag; 17 MatScalar *aa=a->a,*v; 18 PetscScalar s1,*x,*b; 19 20 PetscFunctionBegin; 21 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 22 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24 25 /* forward solve the U^T */ 26 for (i=0; i<n; i++) { 27 28 v = aa + diag[i]; 29 /* multiply by the inverse of the block diagonal */ 30 s1 = (*v++)*x[i]; 31 vi = aj + diag[i] + 1; 32 nz = ai[i+1] - diag[i] - 1; 33 while (nz--) { 34 x[*vi++] -= (*v++)*s1; 35 } 36 x[i] = s1; 37 } 38 /* backward solve the L^T */ 39 for (i=n-1; i>=0; i--){ 40 v = aa + diag[i] - 1; 41 vi = aj + diag[i] - 1; 42 nz = diag[i] - ai[i]; 43 s1 = x[i]; 44 while (nz--) { 45 x[*vi--] -= (*v--)*s1; 46 } 47 } 48 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50 PetscLogFlops(2*(a->nz) - A->n); 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 56 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57 { 58 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59 PetscErrorCode ierr; 60 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 PetscErrorCode ierr; 119 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 120 int *diag = a->diag,oidx; 121 MatScalar *aa=a->a,*v; 122 PetscScalar s1,s2,s3,x1,x2,x3; 123 PetscScalar *x,*b; 124 125 PetscFunctionBegin; 126 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 127 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 128 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 129 130 /* forward solve the U^T */ 131 idx = 0; 132 for (i=0; i<n; i++) { 133 134 v = aa + 9*diag[i]; 135 /* multiply by the inverse of the block diagonal */ 136 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 137 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 138 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 139 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 140 v += 9; 141 142 vi = aj + diag[i] + 1; 143 nz = ai[i+1] - diag[i] - 1; 144 while (nz--) { 145 oidx = 3*(*vi++); 146 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 147 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 148 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 149 v += 9; 150 } 151 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 152 idx += 3; 153 } 154 /* backward solve the L^T */ 155 for (i=n-1; i>=0; i--){ 156 v = aa + 9*diag[i] - 9; 157 vi = aj + diag[i] - 1; 158 nz = diag[i] - ai[i]; 159 idt = 3*i; 160 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 161 while (nz--) { 162 idx = 3*(*vi--); 163 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 164 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 165 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 166 v -= 9; 167 } 168 } 169 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171 PetscLogFlops(2*9*(a->nz) - 3*A->n); 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 177 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 178 { 179 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 180 PetscErrorCode ierr; 181 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 182 int *diag = a->diag,oidx; 183 MatScalar *aa=a->a,*v; 184 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 185 PetscScalar *x,*b; 186 187 PetscFunctionBegin; 188 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 189 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 190 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 191 192 /* forward solve the U^T */ 193 idx = 0; 194 for (i=0; i<n; i++) { 195 196 v = aa + 16*diag[i]; 197 /* multiply by the inverse of the block diagonal */ 198 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 199 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 200 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 201 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 202 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 203 v += 16; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 4*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 210 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 211 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 212 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 213 v += 16; 214 } 215 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 216 idx += 4; 217 } 218 /* backward solve the L^T */ 219 for (i=n-1; i>=0; i--){ 220 v = aa + 16*diag[i] - 16; 221 vi = aj + diag[i] - 1; 222 nz = diag[i] - ai[i]; 223 idt = 4*i; 224 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 225 while (nz--) { 226 idx = 4*(*vi--); 227 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 228 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 229 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 230 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 231 v -= 16; 232 } 233 } 234 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 235 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 236 PetscLogFlops(2*16*(a->nz) - 4*A->n); 237 PetscFunctionReturn(0); 238 } 239 240 #undef __FUNCT__ 241 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 242 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 243 { 244 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 245 PetscErrorCode ierr; 246 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 247 int *diag = a->diag,oidx; 248 MatScalar *aa=a->a,*v; 249 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 250 PetscScalar *x,*b; 251 252 PetscFunctionBegin; 253 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 254 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 255 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 256 257 /* forward solve the U^T */ 258 idx = 0; 259 for (i=0; i<n; i++) { 260 261 v = aa + 25*diag[i]; 262 /* multiply by the inverse of the block diagonal */ 263 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 264 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 265 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 266 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 267 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 268 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 269 v += 25; 270 271 vi = aj + diag[i] + 1; 272 nz = ai[i+1] - diag[i] - 1; 273 while (nz--) { 274 oidx = 5*(*vi++); 275 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 276 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 277 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 278 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 279 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 280 v += 25; 281 } 282 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 283 idx += 5; 284 } 285 /* backward solve the L^T */ 286 for (i=n-1; i>=0; i--){ 287 v = aa + 25*diag[i] - 25; 288 vi = aj + diag[i] - 1; 289 nz = diag[i] - ai[i]; 290 idt = 5*i; 291 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 292 while (nz--) { 293 idx = 5*(*vi--); 294 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 295 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 296 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 297 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 298 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 299 v -= 25; 300 } 301 } 302 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 303 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 304 PetscLogFlops(2*25*(a->nz) - 5*A->n); 305 PetscFunctionReturn(0); 306 } 307 308 #undef __FUNCT__ 309 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 310 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 311 { 312 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 313 PetscErrorCode ierr; 314 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 315 int *diag = a->diag,oidx; 316 MatScalar *aa=a->a,*v; 317 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 318 PetscScalar *x,*b; 319 320 PetscFunctionBegin; 321 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 322 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 323 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 324 325 /* forward solve the U^T */ 326 idx = 0; 327 for (i=0; i<n; i++) { 328 329 v = aa + 36*diag[i]; 330 /* multiply by the inverse of the block diagonal */ 331 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 332 x6 = x[5+idx]; 333 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 334 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 335 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 336 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 337 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 338 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 339 v += 36; 340 341 vi = aj + diag[i] + 1; 342 nz = ai[i+1] - diag[i] - 1; 343 while (nz--) { 344 oidx = 6*(*vi++); 345 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 346 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 347 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 348 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 349 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 350 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 351 v += 36; 352 } 353 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 354 x[5+idx] = s6; 355 idx += 6; 356 } 357 /* backward solve the L^T */ 358 for (i=n-1; i>=0; i--){ 359 v = aa + 36*diag[i] - 36; 360 vi = aj + diag[i] - 1; 361 nz = diag[i] - ai[i]; 362 idt = 6*i; 363 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 364 s6 = x[5+idt]; 365 while (nz--) { 366 idx = 6*(*vi--); 367 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 368 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 369 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 370 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 371 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 372 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 373 v -= 36; 374 } 375 } 376 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 377 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 378 PetscLogFlops(2*36*(a->nz) - 6*A->n); 379 PetscFunctionReturn(0); 380 } 381 382 #undef __FUNCT__ 383 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 384 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 385 { 386 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 387 PetscErrorCode ierr; 388 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 389 int *diag = a->diag,oidx; 390 MatScalar *aa=a->a,*v; 391 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 392 PetscScalar *x,*b; 393 394 PetscFunctionBegin; 395 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 396 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 397 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 398 399 /* forward solve the U^T */ 400 idx = 0; 401 for (i=0; i<n; i++) { 402 403 v = aa + 49*diag[i]; 404 /* multiply by the inverse of the block diagonal */ 405 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 406 x6 = x[5+idx]; x7 = x[6+idx]; 407 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 408 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 409 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 410 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 411 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 412 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 413 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 414 v += 49; 415 416 vi = aj + diag[i] + 1; 417 nz = ai[i+1] - diag[i] - 1; 418 while (nz--) { 419 oidx = 7*(*vi++); 420 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 421 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 422 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 423 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 424 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 425 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 426 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 427 v += 49; 428 } 429 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 430 x[5+idx] = s6;x[6+idx] = s7; 431 idx += 7; 432 } 433 /* backward solve the L^T */ 434 for (i=n-1; i>=0; i--){ 435 v = aa + 49*diag[i] - 49; 436 vi = aj + diag[i] - 1; 437 nz = diag[i] - ai[i]; 438 idt = 7*i; 439 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 440 s6 = x[5+idt];s7 = x[6+idt]; 441 while (nz--) { 442 idx = 7*(*vi--); 443 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 444 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 445 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 446 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 447 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 448 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 449 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 450 v -= 49; 451 } 452 } 453 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 454 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 455 PetscLogFlops(2*49*(a->nz) - 7*A->n); 456 PetscFunctionReturn(0); 457 } 458 459 /*---------------------------------------------------------------------------------------------*/ 460 #undef __FUNCT__ 461 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 462 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 463 { 464 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 465 IS iscol=a->col,isrow=a->row; 466 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 467 int *diag = a->diag; 468 MatScalar *aa=a->a,*v; 469 PetscScalar s1,*x,*b,*t; 470 471 PetscFunctionBegin; 472 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 473 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 474 t = a->solve_work; 475 476 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 477 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 478 479 /* copy the b into temp work space according to permutation */ 480 for (i=0; i<n; i++) { 481 t[i] = b[c[i]]; 482 } 483 484 /* forward solve the U^T */ 485 for (i=0; i<n; i++) { 486 487 v = aa + diag[i]; 488 /* multiply by the inverse of the block diagonal */ 489 s1 = (*v++)*t[i]; 490 vi = aj + diag[i] + 1; 491 nz = ai[i+1] - diag[i] - 1; 492 while (nz--) { 493 t[*vi++] -= (*v++)*s1; 494 } 495 t[i] = s1; 496 } 497 /* backward solve the L^T */ 498 for (i=n-1; i>=0; i--){ 499 v = aa + diag[i] - 1; 500 vi = aj + diag[i] - 1; 501 nz = diag[i] - ai[i]; 502 s1 = t[i]; 503 while (nz--) { 504 t[*vi--] -= (*v--)*s1; 505 } 506 } 507 508 /* copy t into x according to permutation */ 509 for (i=0; i<n; i++) { 510 x[r[i]] = t[i]; 511 } 512 513 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 514 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 515 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 516 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 517 PetscLogFlops(2*(a->nz) - A->n); 518 PetscFunctionReturn(0); 519 } 520 521 #undef __FUNCT__ 522 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 523 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 524 { 525 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 526 IS iscol=a->col,isrow=a->row; 527 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 528 int *diag = a->diag,ii,ic,ir,oidx; 529 MatScalar *aa=a->a,*v; 530 PetscScalar s1,s2,x1,x2; 531 PetscScalar *x,*b,*t; 532 533 PetscFunctionBegin; 534 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 536 t = a->solve_work; 537 538 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 539 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 540 541 /* copy the b into temp work space according to permutation */ 542 ii = 0; 543 for (i=0; i<n; i++) { 544 ic = 2*c[i]; 545 t[ii] = b[ic]; 546 t[ii+1] = b[ic+1]; 547 ii += 2; 548 } 549 550 /* forward solve the U^T */ 551 idx = 0; 552 for (i=0; i<n; i++) { 553 554 v = aa + 4*diag[i]; 555 /* multiply by the inverse of the block diagonal */ 556 x1 = t[idx]; x2 = t[1+idx]; 557 s1 = v[0]*x1 + v[1]*x2; 558 s2 = v[2]*x1 + v[3]*x2; 559 v += 4; 560 561 vi = aj + diag[i] + 1; 562 nz = ai[i+1] - diag[i] - 1; 563 while (nz--) { 564 oidx = 2*(*vi++); 565 t[oidx] -= v[0]*s1 + v[1]*s2; 566 t[oidx+1] -= v[2]*s1 + v[3]*s2; 567 v += 4; 568 } 569 t[idx] = s1;t[1+idx] = s2; 570 idx += 2; 571 } 572 /* backward solve the L^T */ 573 for (i=n-1; i>=0; i--){ 574 v = aa + 4*diag[i] - 4; 575 vi = aj + diag[i] - 1; 576 nz = diag[i] - ai[i]; 577 idt = 2*i; 578 s1 = t[idt]; s2 = t[1+idt]; 579 while (nz--) { 580 idx = 2*(*vi--); 581 t[idx] -= v[0]*s1 + v[1]*s2; 582 t[idx+1] -= v[2]*s1 + v[3]*s2; 583 v -= 4; 584 } 585 } 586 587 /* copy t into x according to permutation */ 588 ii = 0; 589 for (i=0; i<n; i++) { 590 ir = 2*r[i]; 591 x[ir] = t[ii]; 592 x[ir+1] = t[ii+1]; 593 ii += 2; 594 } 595 596 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 597 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 598 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 599 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 600 PetscLogFlops(2*4*(a->nz) - 2*A->n); 601 PetscFunctionReturn(0); 602 } 603 604 #undef __FUNCT__ 605 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 606 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 607 { 608 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 609 IS iscol=a->col,isrow=a->row; 610 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 611 int *diag = a->diag,ii,ic,ir,oidx; 612 MatScalar *aa=a->a,*v; 613 PetscScalar s1,s2,s3,x1,x2,x3; 614 PetscScalar *x,*b,*t; 615 616 PetscFunctionBegin; 617 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 619 t = a->solve_work; 620 621 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 622 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 623 624 /* copy the b into temp work space according to permutation */ 625 ii = 0; 626 for (i=0; i<n; i++) { 627 ic = 3*c[i]; 628 t[ii] = b[ic]; 629 t[ii+1] = b[ic+1]; 630 t[ii+2] = b[ic+2]; 631 ii += 3; 632 } 633 634 /* forward solve the U^T */ 635 idx = 0; 636 for (i=0; i<n; i++) { 637 638 v = aa + 9*diag[i]; 639 /* multiply by the inverse of the block diagonal */ 640 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 641 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 642 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 643 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 644 v += 9; 645 646 vi = aj + diag[i] + 1; 647 nz = ai[i+1] - diag[i] - 1; 648 while (nz--) { 649 oidx = 3*(*vi++); 650 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 651 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 652 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 653 v += 9; 654 } 655 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 656 idx += 3; 657 } 658 /* backward solve the L^T */ 659 for (i=n-1; i>=0; i--){ 660 v = aa + 9*diag[i] - 9; 661 vi = aj + diag[i] - 1; 662 nz = diag[i] - ai[i]; 663 idt = 3*i; 664 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 665 while (nz--) { 666 idx = 3*(*vi--); 667 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 668 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 669 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 670 v -= 9; 671 } 672 } 673 674 /* copy t into x according to permutation */ 675 ii = 0; 676 for (i=0; i<n; i++) { 677 ir = 3*r[i]; 678 x[ir] = t[ii]; 679 x[ir+1] = t[ii+1]; 680 x[ir+2] = t[ii+2]; 681 ii += 3; 682 } 683 684 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 685 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 686 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 688 PetscLogFlops(2*9*(a->nz) - 3*A->n); 689 PetscFunctionReturn(0); 690 } 691 692 #undef __FUNCT__ 693 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 694 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 695 { 696 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 697 IS iscol=a->col,isrow=a->row; 698 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 699 int *diag = a->diag,ii,ic,ir,oidx; 700 MatScalar *aa=a->a,*v; 701 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 702 PetscScalar *x,*b,*t; 703 704 PetscFunctionBegin; 705 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 706 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 707 t = a->solve_work; 708 709 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 710 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 711 712 /* copy the b into temp work space according to permutation */ 713 ii = 0; 714 for (i=0; i<n; i++) { 715 ic = 4*c[i]; 716 t[ii] = b[ic]; 717 t[ii+1] = b[ic+1]; 718 t[ii+2] = b[ic+2]; 719 t[ii+3] = b[ic+3]; 720 ii += 4; 721 } 722 723 /* forward solve the U^T */ 724 idx = 0; 725 for (i=0; i<n; i++) { 726 727 v = aa + 16*diag[i]; 728 /* multiply by the inverse of the block diagonal */ 729 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 730 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 731 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 732 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 733 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 734 v += 16; 735 736 vi = aj + diag[i] + 1; 737 nz = ai[i+1] - diag[i] - 1; 738 while (nz--) { 739 oidx = 4*(*vi++); 740 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 741 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 742 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 743 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 744 v += 16; 745 } 746 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 747 idx += 4; 748 } 749 /* backward solve the L^T */ 750 for (i=n-1; i>=0; i--){ 751 v = aa + 16*diag[i] - 16; 752 vi = aj + diag[i] - 1; 753 nz = diag[i] - ai[i]; 754 idt = 4*i; 755 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 756 while (nz--) { 757 idx = 4*(*vi--); 758 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 759 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 760 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 761 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 762 v -= 16; 763 } 764 } 765 766 /* copy t into x according to permutation */ 767 ii = 0; 768 for (i=0; i<n; i++) { 769 ir = 4*r[i]; 770 x[ir] = t[ii]; 771 x[ir+1] = t[ii+1]; 772 x[ir+2] = t[ii+2]; 773 x[ir+3] = t[ii+3]; 774 ii += 4; 775 } 776 777 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 778 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 779 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 780 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 781 PetscLogFlops(2*16*(a->nz) - 4*A->n); 782 PetscFunctionReturn(0); 783 } 784 785 #undef __FUNCT__ 786 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 787 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 788 { 789 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 790 IS iscol=a->col,isrow=a->row; 791 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 792 int *diag = a->diag,ii,ic,ir,oidx; 793 MatScalar *aa=a->a,*v; 794 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 795 PetscScalar *x,*b,*t; 796 797 PetscFunctionBegin; 798 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 799 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 800 t = a->solve_work; 801 802 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 803 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 804 805 /* copy the b into temp work space according to permutation */ 806 ii = 0; 807 for (i=0; i<n; i++) { 808 ic = 5*c[i]; 809 t[ii] = b[ic]; 810 t[ii+1] = b[ic+1]; 811 t[ii+2] = b[ic+2]; 812 t[ii+3] = b[ic+3]; 813 t[ii+4] = b[ic+4]; 814 ii += 5; 815 } 816 817 /* forward solve the U^T */ 818 idx = 0; 819 for (i=0; i<n; i++) { 820 821 v = aa + 25*diag[i]; 822 /* multiply by the inverse of the block diagonal */ 823 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 824 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 825 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 826 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 827 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 828 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 829 v += 25; 830 831 vi = aj + diag[i] + 1; 832 nz = ai[i+1] - diag[i] - 1; 833 while (nz--) { 834 oidx = 5*(*vi++); 835 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 836 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 837 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 838 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 839 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 840 v += 25; 841 } 842 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 843 idx += 5; 844 } 845 /* backward solve the L^T */ 846 for (i=n-1; i>=0; i--){ 847 v = aa + 25*diag[i] - 25; 848 vi = aj + diag[i] - 1; 849 nz = diag[i] - ai[i]; 850 idt = 5*i; 851 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 852 while (nz--) { 853 idx = 5*(*vi--); 854 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 855 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 856 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 857 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 858 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 859 v -= 25; 860 } 861 } 862 863 /* copy t into x according to permutation */ 864 ii = 0; 865 for (i=0; i<n; i++) { 866 ir = 5*r[i]; 867 x[ir] = t[ii]; 868 x[ir+1] = t[ii+1]; 869 x[ir+2] = t[ii+2]; 870 x[ir+3] = t[ii+3]; 871 x[ir+4] = t[ii+4]; 872 ii += 5; 873 } 874 875 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 876 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 877 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 878 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 879 PetscLogFlops(2*25*(a->nz) - 5*A->n); 880 PetscFunctionReturn(0); 881 } 882 883 #undef __FUNCT__ 884 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 885 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 886 { 887 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 888 IS iscol=a->col,isrow=a->row; 889 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 890 int *diag = a->diag,ii,ic,ir,oidx; 891 MatScalar *aa=a->a,*v; 892 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 893 PetscScalar *x,*b,*t; 894 895 PetscFunctionBegin; 896 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 897 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 898 t = a->solve_work; 899 900 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 901 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 902 903 /* copy the b into temp work space according to permutation */ 904 ii = 0; 905 for (i=0; i<n; i++) { 906 ic = 6*c[i]; 907 t[ii] = b[ic]; 908 t[ii+1] = b[ic+1]; 909 t[ii+2] = b[ic+2]; 910 t[ii+3] = b[ic+3]; 911 t[ii+4] = b[ic+4]; 912 t[ii+5] = b[ic+5]; 913 ii += 6; 914 } 915 916 /* forward solve the U^T */ 917 idx = 0; 918 for (i=0; i<n; i++) { 919 920 v = aa + 36*diag[i]; 921 /* multiply by the inverse of the block diagonal */ 922 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 923 x6 = t[5+idx]; 924 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 925 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 926 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 927 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 928 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 929 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 930 v += 36; 931 932 vi = aj + diag[i] + 1; 933 nz = ai[i+1] - diag[i] - 1; 934 while (nz--) { 935 oidx = 6*(*vi++); 936 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 937 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 938 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 939 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 940 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 941 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 942 v += 36; 943 } 944 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 945 t[5+idx] = s6; 946 idx += 6; 947 } 948 /* backward solve the L^T */ 949 for (i=n-1; i>=0; i--){ 950 v = aa + 36*diag[i] - 36; 951 vi = aj + diag[i] - 1; 952 nz = diag[i] - ai[i]; 953 idt = 6*i; 954 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 955 s6 = t[5+idt]; 956 while (nz--) { 957 idx = 6*(*vi--); 958 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 959 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 960 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 961 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 962 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 963 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 964 v -= 36; 965 } 966 } 967 968 /* copy t into x according to permutation */ 969 ii = 0; 970 for (i=0; i<n; i++) { 971 ir = 6*r[i]; 972 x[ir] = t[ii]; 973 x[ir+1] = t[ii+1]; 974 x[ir+2] = t[ii+2]; 975 x[ir+3] = t[ii+3]; 976 x[ir+4] = t[ii+4]; 977 x[ir+5] = t[ii+5]; 978 ii += 6; 979 } 980 981 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 982 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 983 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 984 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 985 PetscLogFlops(2*36*(a->nz) - 6*A->n); 986 PetscFunctionReturn(0); 987 } 988 989 #undef __FUNCT__ 990 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 991 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 992 { 993 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 994 IS iscol=a->col,isrow=a->row; 995 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 996 int *diag = a->diag,ii,ic,ir,oidx; 997 MatScalar *aa=a->a,*v; 998 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 999 PetscScalar *x,*b,*t; 1000 1001 PetscFunctionBegin; 1002 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1003 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1004 t = a->solve_work; 1005 1006 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1007 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1008 1009 /* copy the b into temp work space according to permutation */ 1010 ii = 0; 1011 for (i=0; i<n; i++) { 1012 ic = 7*c[i]; 1013 t[ii] = b[ic]; 1014 t[ii+1] = b[ic+1]; 1015 t[ii+2] = b[ic+2]; 1016 t[ii+3] = b[ic+3]; 1017 t[ii+4] = b[ic+4]; 1018 t[ii+5] = b[ic+5]; 1019 t[ii+6] = b[ic+6]; 1020 ii += 7; 1021 } 1022 1023 /* forward solve the U^T */ 1024 idx = 0; 1025 for (i=0; i<n; i++) { 1026 1027 v = aa + 49*diag[i]; 1028 /* multiply by the inverse of the block diagonal */ 1029 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1030 x6 = t[5+idx]; x7 = t[6+idx]; 1031 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1032 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1033 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1034 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1035 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1036 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1037 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1038 v += 49; 1039 1040 vi = aj + diag[i] + 1; 1041 nz = ai[i+1] - diag[i] - 1; 1042 while (nz--) { 1043 oidx = 7*(*vi++); 1044 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1045 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1046 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1047 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1048 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1049 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1050 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1051 v += 49; 1052 } 1053 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1054 t[5+idx] = s6;t[6+idx] = s7; 1055 idx += 7; 1056 } 1057 /* backward solve the L^T */ 1058 for (i=n-1; i>=0; i--){ 1059 v = aa + 49*diag[i] - 49; 1060 vi = aj + diag[i] - 1; 1061 nz = diag[i] - ai[i]; 1062 idt = 7*i; 1063 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1064 s6 = t[5+idt];s7 = t[6+idt]; 1065 while (nz--) { 1066 idx = 7*(*vi--); 1067 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1068 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1069 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1070 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1071 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1072 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1073 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1074 v -= 49; 1075 } 1076 } 1077 1078 /* copy t into x according to permutation */ 1079 ii = 0; 1080 for (i=0; i<n; i++) { 1081 ir = 7*r[i]; 1082 x[ir] = t[ii]; 1083 x[ir+1] = t[ii+1]; 1084 x[ir+2] = t[ii+2]; 1085 x[ir+3] = t[ii+3]; 1086 x[ir+4] = t[ii+4]; 1087 x[ir+5] = t[ii+5]; 1088 x[ir+6] = t[ii+6]; 1089 ii += 7; 1090 } 1091 1092 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1093 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1094 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1095 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1096 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1097 PetscFunctionReturn(0); 1098 } 1099 1100 /* ----------------------------------------------------------- */ 1101 #undef __FUNCT__ 1102 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1103 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1104 { 1105 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1106 IS iscol=a->col,isrow=a->row; 1107 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1108 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1109 MatScalar *aa=a->a,*v; 1110 PetscScalar *x,*b,*s,*t,*ls; 1111 1112 PetscFunctionBegin; 1113 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1114 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1115 t = a->solve_work; 1116 1117 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1118 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1119 1120 /* forward solve the lower triangular */ 1121 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1122 for (i=1; i<n; i++) { 1123 v = aa + bs2*ai[i]; 1124 vi = aj + ai[i]; 1125 nz = a->diag[i] - ai[i]; 1126 s = t + bs*i; 1127 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1128 while (nz--) { 1129 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1130 v += bs2; 1131 } 1132 } 1133 /* backward solve the upper triangular */ 1134 ls = a->solve_work + A->n; 1135 for (i=n-1; i>=0; i--){ 1136 v = aa + bs2*(a->diag[i] + 1); 1137 vi = aj + a->diag[i] + 1; 1138 nz = ai[i+1] - a->diag[i] - 1; 1139 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1140 while (nz--) { 1141 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1142 v += bs2; 1143 } 1144 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1145 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1146 } 1147 1148 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1149 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1150 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1151 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1152 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1153 PetscFunctionReturn(0); 1154 } 1155 1156 #undef __FUNCT__ 1157 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1158 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1159 { 1160 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1161 IS iscol=a->col,isrow=a->row; 1162 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1163 int *diag = a->diag; 1164 MatScalar *aa=a->a,*v; 1165 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1166 PetscScalar *x,*b,*t; 1167 1168 PetscFunctionBegin; 1169 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1170 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1171 t = a->solve_work; 1172 1173 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1174 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1175 1176 /* forward solve the lower triangular */ 1177 idx = 7*(*r++); 1178 t[0] = b[idx]; t[1] = b[1+idx]; 1179 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1180 t[5] = b[5+idx]; t[6] = b[6+idx]; 1181 1182 for (i=1; i<n; i++) { 1183 v = aa + 49*ai[i]; 1184 vi = aj + ai[i]; 1185 nz = diag[i] - ai[i]; 1186 idx = 7*(*r++); 1187 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1188 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1189 while (nz--) { 1190 idx = 7*(*vi++); 1191 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1192 x4 = t[3+idx];x5 = t[4+idx]; 1193 x6 = t[5+idx];x7 = t[6+idx]; 1194 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1195 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1196 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1197 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1198 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1199 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1200 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1201 v += 49; 1202 } 1203 idx = 7*i; 1204 t[idx] = s1;t[1+idx] = s2; 1205 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1206 t[5+idx] = s6;t[6+idx] = s7; 1207 } 1208 /* backward solve the upper triangular */ 1209 for (i=n-1; i>=0; i--){ 1210 v = aa + 49*diag[i] + 49; 1211 vi = aj + diag[i] + 1; 1212 nz = ai[i+1] - diag[i] - 1; 1213 idt = 7*i; 1214 s1 = t[idt]; s2 = t[1+idt]; 1215 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1216 s6 = t[5+idt];s7 = t[6+idt]; 1217 while (nz--) { 1218 idx = 7*(*vi++); 1219 x1 = t[idx]; x2 = t[1+idx]; 1220 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1221 x6 = t[5+idx]; x7 = t[6+idx]; 1222 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1223 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1224 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1225 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1226 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1227 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1228 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1229 v += 49; 1230 } 1231 idc = 7*(*c--); 1232 v = aa + 49*diag[i]; 1233 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1234 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1235 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1236 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1237 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1238 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1239 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1240 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1241 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1242 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1243 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1244 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1245 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1246 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1247 } 1248 1249 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1250 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1251 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1252 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1253 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1254 PetscFunctionReturn(0); 1255 } 1256 1257 #undef __FUNCT__ 1258 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1259 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1260 { 1261 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1262 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1263 PetscErrorCode ierr; 1264 int *diag = a->diag,jdx; 1265 MatScalar *aa=a->a,*v; 1266 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1267 1268 PetscFunctionBegin; 1269 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1270 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1271 /* forward solve the lower triangular */ 1272 idx = 0; 1273 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1274 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1275 x[6] = b[6+idx]; 1276 for (i=1; i<n; i++) { 1277 v = aa + 49*ai[i]; 1278 vi = aj + ai[i]; 1279 nz = diag[i] - ai[i]; 1280 idx = 7*i; 1281 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1282 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1283 s7 = b[6+idx]; 1284 while (nz--) { 1285 jdx = 7*(*vi++); 1286 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1287 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1288 x7 = x[6+jdx]; 1289 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1290 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1291 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1292 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1293 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1294 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1295 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1296 v += 49; 1297 } 1298 x[idx] = s1; 1299 x[1+idx] = s2; 1300 x[2+idx] = s3; 1301 x[3+idx] = s4; 1302 x[4+idx] = s5; 1303 x[5+idx] = s6; 1304 x[6+idx] = s7; 1305 } 1306 /* backward solve the upper triangular */ 1307 for (i=n-1; i>=0; i--){ 1308 v = aa + 49*diag[i] + 49; 1309 vi = aj + diag[i] + 1; 1310 nz = ai[i+1] - diag[i] - 1; 1311 idt = 7*i; 1312 s1 = x[idt]; s2 = x[1+idt]; 1313 s3 = x[2+idt]; s4 = x[3+idt]; 1314 s5 = x[4+idt]; s6 = x[5+idt]; 1315 s7 = x[6+idt]; 1316 while (nz--) { 1317 idx = 7*(*vi++); 1318 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1319 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1320 x7 = x[6+idx]; 1321 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1322 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1323 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1324 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1325 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1326 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1327 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1328 v += 49; 1329 } 1330 v = aa + 49*diag[i]; 1331 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1332 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1333 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1334 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1335 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1336 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1337 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1338 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1339 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1340 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1341 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1342 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1343 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1344 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1345 } 1346 1347 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1348 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1349 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1350 PetscFunctionReturn(0); 1351 } 1352 1353 #undef __FUNCT__ 1354 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1355 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1356 { 1357 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1358 IS iscol=a->col,isrow=a->row; 1359 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1360 int *diag = a->diag; 1361 MatScalar *aa=a->a,*v; 1362 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1363 1364 PetscFunctionBegin; 1365 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1366 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1367 t = a->solve_work; 1368 1369 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1370 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1371 1372 /* forward solve the lower triangular */ 1373 idx = 6*(*r++); 1374 t[0] = b[idx]; t[1] = b[1+idx]; 1375 t[2] = b[2+idx]; t[3] = b[3+idx]; 1376 t[4] = b[4+idx]; t[5] = b[5+idx]; 1377 for (i=1; i<n; i++) { 1378 v = aa + 36*ai[i]; 1379 vi = aj + ai[i]; 1380 nz = diag[i] - ai[i]; 1381 idx = 6*(*r++); 1382 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1383 s5 = b[4+idx]; s6 = b[5+idx]; 1384 while (nz--) { 1385 idx = 6*(*vi++); 1386 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1387 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1388 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1389 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1390 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1391 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1392 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1393 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1394 v += 36; 1395 } 1396 idx = 6*i; 1397 t[idx] = s1;t[1+idx] = s2; 1398 t[2+idx] = s3;t[3+idx] = s4; 1399 t[4+idx] = s5;t[5+idx] = s6; 1400 } 1401 /* backward solve the upper triangular */ 1402 for (i=n-1; i>=0; i--){ 1403 v = aa + 36*diag[i] + 36; 1404 vi = aj + diag[i] + 1; 1405 nz = ai[i+1] - diag[i] - 1; 1406 idt = 6*i; 1407 s1 = t[idt]; s2 = t[1+idt]; 1408 s3 = t[2+idt];s4 = t[3+idt]; 1409 s5 = t[4+idt];s6 = t[5+idt]; 1410 while (nz--) { 1411 idx = 6*(*vi++); 1412 x1 = t[idx]; x2 = t[1+idx]; 1413 x3 = t[2+idx]; x4 = t[3+idx]; 1414 x5 = t[4+idx]; x6 = t[5+idx]; 1415 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1416 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1417 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1418 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1419 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1420 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1421 v += 36; 1422 } 1423 idc = 6*(*c--); 1424 v = aa + 36*diag[i]; 1425 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1426 v[18]*s4+v[24]*s5+v[30]*s6; 1427 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1428 v[19]*s4+v[25]*s5+v[31]*s6; 1429 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1430 v[20]*s4+v[26]*s5+v[32]*s6; 1431 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1432 v[21]*s4+v[27]*s5+v[33]*s6; 1433 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1434 v[22]*s4+v[28]*s5+v[34]*s6; 1435 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1436 v[23]*s4+v[29]*s5+v[35]*s6; 1437 } 1438 1439 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1440 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1443 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 #undef __FUNCT__ 1448 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1449 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1450 { 1451 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1452 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1453 PetscErrorCode ierr; 1454 int *diag = a->diag,jdx; 1455 MatScalar *aa=a->a,*v; 1456 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1457 1458 PetscFunctionBegin; 1459 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1460 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1461 /* forward solve the lower triangular */ 1462 idx = 0; 1463 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1464 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1465 for (i=1; i<n; i++) { 1466 v = aa + 36*ai[i]; 1467 vi = aj + ai[i]; 1468 nz = diag[i] - ai[i]; 1469 idx = 6*i; 1470 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1471 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1472 while (nz--) { 1473 jdx = 6*(*vi++); 1474 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1475 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1476 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1477 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1478 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1479 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1480 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1481 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1482 v += 36; 1483 } 1484 x[idx] = s1; 1485 x[1+idx] = s2; 1486 x[2+idx] = s3; 1487 x[3+idx] = s4; 1488 x[4+idx] = s5; 1489 x[5+idx] = s6; 1490 } 1491 /* backward solve the upper triangular */ 1492 for (i=n-1; i>=0; i--){ 1493 v = aa + 36*diag[i] + 36; 1494 vi = aj + diag[i] + 1; 1495 nz = ai[i+1] - diag[i] - 1; 1496 idt = 6*i; 1497 s1 = x[idt]; s2 = x[1+idt]; 1498 s3 = x[2+idt]; s4 = x[3+idt]; 1499 s5 = x[4+idt]; s6 = x[5+idt]; 1500 while (nz--) { 1501 idx = 6*(*vi++); 1502 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1503 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1504 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1505 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1506 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1507 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1508 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1509 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1510 v += 36; 1511 } 1512 v = aa + 36*diag[i]; 1513 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1514 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1515 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1516 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1517 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1518 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1519 } 1520 1521 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1522 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1523 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1524 PetscFunctionReturn(0); 1525 } 1526 1527 #undef __FUNCT__ 1528 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1529 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1530 { 1531 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1532 IS iscol=a->col,isrow=a->row; 1533 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1534 int *diag = a->diag; 1535 MatScalar *aa=a->a,*v; 1536 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1537 1538 PetscFunctionBegin; 1539 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1540 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1541 t = a->solve_work; 1542 1543 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1544 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1545 1546 /* forward solve the lower triangular */ 1547 idx = 5*(*r++); 1548 t[0] = b[idx]; t[1] = b[1+idx]; 1549 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1550 for (i=1; i<n; i++) { 1551 v = aa + 25*ai[i]; 1552 vi = aj + ai[i]; 1553 nz = diag[i] - ai[i]; 1554 idx = 5*(*r++); 1555 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1556 s5 = b[4+idx]; 1557 while (nz--) { 1558 idx = 5*(*vi++); 1559 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1560 x4 = t[3+idx];x5 = t[4+idx]; 1561 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1562 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1563 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1564 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1565 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1566 v += 25; 1567 } 1568 idx = 5*i; 1569 t[idx] = s1;t[1+idx] = s2; 1570 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1571 } 1572 /* backward solve the upper triangular */ 1573 for (i=n-1; i>=0; i--){ 1574 v = aa + 25*diag[i] + 25; 1575 vi = aj + diag[i] + 1; 1576 nz = ai[i+1] - diag[i] - 1; 1577 idt = 5*i; 1578 s1 = t[idt]; s2 = t[1+idt]; 1579 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1580 while (nz--) { 1581 idx = 5*(*vi++); 1582 x1 = t[idx]; x2 = t[1+idx]; 1583 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1584 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1585 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1586 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1587 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1588 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1589 v += 25; 1590 } 1591 idc = 5*(*c--); 1592 v = aa + 25*diag[i]; 1593 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1594 v[15]*s4+v[20]*s5; 1595 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1596 v[16]*s4+v[21]*s5; 1597 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1598 v[17]*s4+v[22]*s5; 1599 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1600 v[18]*s4+v[23]*s5; 1601 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1602 v[19]*s4+v[24]*s5; 1603 } 1604 1605 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1606 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1607 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1608 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1609 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1610 PetscFunctionReturn(0); 1611 } 1612 1613 #undef __FUNCT__ 1614 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1615 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1616 { 1617 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1618 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1619 PetscErrorCode ierr; 1620 int *diag = a->diag,jdx; 1621 MatScalar *aa=a->a,*v; 1622 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1623 1624 PetscFunctionBegin; 1625 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1626 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1627 /* forward solve the lower triangular */ 1628 idx = 0; 1629 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1630 for (i=1; i<n; i++) { 1631 v = aa + 25*ai[i]; 1632 vi = aj + ai[i]; 1633 nz = diag[i] - ai[i]; 1634 idx = 5*i; 1635 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1636 while (nz--) { 1637 jdx = 5*(*vi++); 1638 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1639 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1640 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1641 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1642 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1643 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1644 v += 25; 1645 } 1646 x[idx] = s1; 1647 x[1+idx] = s2; 1648 x[2+idx] = s3; 1649 x[3+idx] = s4; 1650 x[4+idx] = s5; 1651 } 1652 /* backward solve the upper triangular */ 1653 for (i=n-1; i>=0; i--){ 1654 v = aa + 25*diag[i] + 25; 1655 vi = aj + diag[i] + 1; 1656 nz = ai[i+1] - diag[i] - 1; 1657 idt = 5*i; 1658 s1 = x[idt]; s2 = x[1+idt]; 1659 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1660 while (nz--) { 1661 idx = 5*(*vi++); 1662 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1663 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1664 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1665 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1666 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1667 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1668 v += 25; 1669 } 1670 v = aa + 25*diag[i]; 1671 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1672 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1673 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1674 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1675 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1676 } 1677 1678 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1679 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1680 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1681 PetscFunctionReturn(0); 1682 } 1683 1684 #undef __FUNCT__ 1685 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1686 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1687 { 1688 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1689 IS iscol=a->col,isrow=a->row; 1690 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1691 int *diag = a->diag; 1692 MatScalar *aa=a->a,*v; 1693 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1694 1695 PetscFunctionBegin; 1696 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1697 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1698 t = a->solve_work; 1699 1700 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1701 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1702 1703 /* forward solve the lower triangular */ 1704 idx = 4*(*r++); 1705 t[0] = b[idx]; t[1] = b[1+idx]; 1706 t[2] = b[2+idx]; t[3] = b[3+idx]; 1707 for (i=1; i<n; i++) { 1708 v = aa + 16*ai[i]; 1709 vi = aj + ai[i]; 1710 nz = diag[i] - ai[i]; 1711 idx = 4*(*r++); 1712 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1713 while (nz--) { 1714 idx = 4*(*vi++); 1715 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1716 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1717 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1718 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1719 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1720 v += 16; 1721 } 1722 idx = 4*i; 1723 t[idx] = s1;t[1+idx] = s2; 1724 t[2+idx] = s3;t[3+idx] = s4; 1725 } 1726 /* backward solve the upper triangular */ 1727 for (i=n-1; i>=0; i--){ 1728 v = aa + 16*diag[i] + 16; 1729 vi = aj + diag[i] + 1; 1730 nz = ai[i+1] - diag[i] - 1; 1731 idt = 4*i; 1732 s1 = t[idt]; s2 = t[1+idt]; 1733 s3 = t[2+idt];s4 = t[3+idt]; 1734 while (nz--) { 1735 idx = 4*(*vi++); 1736 x1 = t[idx]; x2 = t[1+idx]; 1737 x3 = t[2+idx]; x4 = t[3+idx]; 1738 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1739 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1740 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1741 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1742 v += 16; 1743 } 1744 idc = 4*(*c--); 1745 v = aa + 16*diag[i]; 1746 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1747 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1748 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1749 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1750 } 1751 1752 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1753 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1754 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1755 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1756 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1757 PetscFunctionReturn(0); 1758 } 1759 1760 #undef __FUNCT__ 1761 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1762 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1763 { 1764 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1765 IS iscol=a->col,isrow=a->row; 1766 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1767 int *diag = a->diag; 1768 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1769 PetscScalar *x,*b; 1770 1771 PetscFunctionBegin; 1772 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1773 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1774 t = (MatScalar *)a->solve_work; 1775 1776 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1777 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1778 1779 /* forward solve the lower triangular */ 1780 idx = 4*(*r++); 1781 t[0] = (MatScalar)b[idx]; 1782 t[1] = (MatScalar)b[1+idx]; 1783 t[2] = (MatScalar)b[2+idx]; 1784 t[3] = (MatScalar)b[3+idx]; 1785 for (i=1; i<n; i++) { 1786 v = aa + 16*ai[i]; 1787 vi = aj + ai[i]; 1788 nz = diag[i] - ai[i]; 1789 idx = 4*(*r++); 1790 s1 = (MatScalar)b[idx]; 1791 s2 = (MatScalar)b[1+idx]; 1792 s3 = (MatScalar)b[2+idx]; 1793 s4 = (MatScalar)b[3+idx]; 1794 while (nz--) { 1795 idx = 4*(*vi++); 1796 x1 = t[idx]; 1797 x2 = t[1+idx]; 1798 x3 = t[2+idx]; 1799 x4 = t[3+idx]; 1800 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1801 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1802 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1803 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1804 v += 16; 1805 } 1806 idx = 4*i; 1807 t[idx] = s1; 1808 t[1+idx] = s2; 1809 t[2+idx] = s3; 1810 t[3+idx] = s4; 1811 } 1812 /* backward solve the upper triangular */ 1813 for (i=n-1; i>=0; i--){ 1814 v = aa + 16*diag[i] + 16; 1815 vi = aj + diag[i] + 1; 1816 nz = ai[i+1] - diag[i] - 1; 1817 idt = 4*i; 1818 s1 = t[idt]; 1819 s2 = t[1+idt]; 1820 s3 = t[2+idt]; 1821 s4 = t[3+idt]; 1822 while (nz--) { 1823 idx = 4*(*vi++); 1824 x1 = t[idx]; 1825 x2 = t[1+idx]; 1826 x3 = t[2+idx]; 1827 x4 = t[3+idx]; 1828 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1829 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1830 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1831 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1832 v += 16; 1833 } 1834 idc = 4*(*c--); 1835 v = aa + 16*diag[i]; 1836 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1837 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1838 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1839 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1840 x[idc] = (PetscScalar)t[idt]; 1841 x[1+idc] = (PetscScalar)t[1+idt]; 1842 x[2+idc] = (PetscScalar)t[2+idt]; 1843 x[3+idc] = (PetscScalar)t[3+idt]; 1844 } 1845 1846 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1847 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1848 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1849 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1850 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1851 PetscFunctionReturn(0); 1852 } 1853 1854 #if defined (PETSC_HAVE_SSE) 1855 1856 #include PETSC_HAVE_SSE 1857 1858 #undef __FUNCT__ 1859 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1860 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1861 { 1862 /* 1863 Note: This code uses demotion of double 1864 to float when performing the mixed-mode computation. 1865 This may not be numerically reasonable for all applications. 1866 */ 1867 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1868 IS iscol=a->col,isrow=a->row; 1869 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1870 int *diag = a->diag,ai16; 1871 MatScalar *aa=a->a,*v; 1872 PetscScalar *x,*b,*t; 1873 1874 /* Make space in temp stack for 16 Byte Aligned arrays */ 1875 float ssealignedspace[11],*tmps,*tmpx; 1876 unsigned long offset; 1877 1878 PetscFunctionBegin; 1879 SSE_SCOPE_BEGIN; 1880 1881 offset = (unsigned long)ssealignedspace % 16; 1882 if (offset) offset = (16 - offset)/4; 1883 tmps = &ssealignedspace[offset]; 1884 tmpx = &ssealignedspace[offset+4]; 1885 PREFETCH_NTA(aa+16*ai[1]); 1886 1887 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1888 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1889 t = a->solve_work; 1890 1891 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1892 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1893 1894 /* forward solve the lower triangular */ 1895 idx = 4*(*r++); 1896 t[0] = b[idx]; t[1] = b[1+idx]; 1897 t[2] = b[2+idx]; t[3] = b[3+idx]; 1898 v = aa + 16*ai[1]; 1899 1900 for (i=1; i<n;) { 1901 PREFETCH_NTA(&v[8]); 1902 vi = aj + ai[i]; 1903 nz = diag[i] - ai[i]; 1904 idx = 4*(*r++); 1905 1906 /* Demote sum from double to float */ 1907 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1908 LOAD_PS(tmps,XMM7); 1909 1910 while (nz--) { 1911 PREFETCH_NTA(&v[16]); 1912 idx = 4*(*vi++); 1913 1914 /* Demote solution (so far) from double to float */ 1915 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1916 1917 /* 4x4 Matrix-Vector product with negative accumulation: */ 1918 SSE_INLINE_BEGIN_2(tmpx,v) 1919 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1920 1921 /* First Column */ 1922 SSE_COPY_PS(XMM0,XMM6) 1923 SSE_SHUFFLE(XMM0,XMM0,0x00) 1924 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1925 SSE_SUB_PS(XMM7,XMM0) 1926 1927 /* Second Column */ 1928 SSE_COPY_PS(XMM1,XMM6) 1929 SSE_SHUFFLE(XMM1,XMM1,0x55) 1930 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1931 SSE_SUB_PS(XMM7,XMM1) 1932 1933 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1934 1935 /* Third Column */ 1936 SSE_COPY_PS(XMM2,XMM6) 1937 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1938 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1939 SSE_SUB_PS(XMM7,XMM2) 1940 1941 /* Fourth Column */ 1942 SSE_COPY_PS(XMM3,XMM6) 1943 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1944 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1945 SSE_SUB_PS(XMM7,XMM3) 1946 SSE_INLINE_END_2 1947 1948 v += 16; 1949 } 1950 idx = 4*i; 1951 v = aa + 16*ai[++i]; 1952 PREFETCH_NTA(v); 1953 STORE_PS(tmps,XMM7); 1954 1955 /* Promote result from float to double */ 1956 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1957 } 1958 /* backward solve the upper triangular */ 1959 idt = 4*(n-1); 1960 ai16 = 16*diag[n-1]; 1961 v = aa + ai16 + 16; 1962 for (i=n-1; i>=0;){ 1963 PREFETCH_NTA(&v[8]); 1964 vi = aj + diag[i] + 1; 1965 nz = ai[i+1] - diag[i] - 1; 1966 1967 /* Demote accumulator from double to float */ 1968 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1969 LOAD_PS(tmps,XMM7); 1970 1971 while (nz--) { 1972 PREFETCH_NTA(&v[16]); 1973 idx = 4*(*vi++); 1974 1975 /* Demote solution (so far) from double to float */ 1976 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1977 1978 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1979 SSE_INLINE_BEGIN_2(tmpx,v) 1980 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1981 1982 /* First Column */ 1983 SSE_COPY_PS(XMM0,XMM6) 1984 SSE_SHUFFLE(XMM0,XMM0,0x00) 1985 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1986 SSE_SUB_PS(XMM7,XMM0) 1987 1988 /* Second Column */ 1989 SSE_COPY_PS(XMM1,XMM6) 1990 SSE_SHUFFLE(XMM1,XMM1,0x55) 1991 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1992 SSE_SUB_PS(XMM7,XMM1) 1993 1994 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1995 1996 /* Third Column */ 1997 SSE_COPY_PS(XMM2,XMM6) 1998 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1999 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2000 SSE_SUB_PS(XMM7,XMM2) 2001 2002 /* Fourth Column */ 2003 SSE_COPY_PS(XMM3,XMM6) 2004 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2005 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2006 SSE_SUB_PS(XMM7,XMM3) 2007 SSE_INLINE_END_2 2008 v += 16; 2009 } 2010 v = aa + ai16; 2011 ai16 = 16*diag[--i]; 2012 PREFETCH_NTA(aa+ai16+16); 2013 /* 2014 Scale the result by the diagonal 4x4 block, 2015 which was inverted as part of the factorization 2016 */ 2017 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2018 /* First Column */ 2019 SSE_COPY_PS(XMM0,XMM7) 2020 SSE_SHUFFLE(XMM0,XMM0,0x00) 2021 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2022 2023 /* Second Column */ 2024 SSE_COPY_PS(XMM1,XMM7) 2025 SSE_SHUFFLE(XMM1,XMM1,0x55) 2026 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2027 SSE_ADD_PS(XMM0,XMM1) 2028 2029 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2030 2031 /* Third Column */ 2032 SSE_COPY_PS(XMM2,XMM7) 2033 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2034 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2035 SSE_ADD_PS(XMM0,XMM2) 2036 2037 /* Fourth Column */ 2038 SSE_COPY_PS(XMM3,XMM7) 2039 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2040 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2041 SSE_ADD_PS(XMM0,XMM3) 2042 2043 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2044 SSE_INLINE_END_3 2045 2046 /* Promote solution from float to double */ 2047 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2048 2049 /* Apply reordering to t and stream into x. */ 2050 /* This way, x doesn't pollute the cache. */ 2051 /* Be careful with size: 2 doubles = 4 floats! */ 2052 idc = 4*(*c--); 2053 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2054 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2055 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2056 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2057 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2058 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2059 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2060 SSE_INLINE_END_2 2061 v = aa + ai16 + 16; 2062 idt -= 4; 2063 } 2064 2065 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2066 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2067 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2068 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2069 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2070 SSE_SCOPE_END; 2071 PetscFunctionReturn(0); 2072 } 2073 2074 #endif 2075 2076 2077 /* 2078 Special case where the matrix was ILU(0) factored in the natural 2079 ordering. This eliminates the need for the column and row permutation. 2080 */ 2081 #undef __FUNCT__ 2082 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2083 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2084 { 2085 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2086 int n=a->mbs,*ai=a->i,*aj=a->j; 2087 PetscErrorCode ierr; 2088 int *diag = a->diag; 2089 MatScalar *aa=a->a; 2090 PetscScalar *x,*b; 2091 2092 PetscFunctionBegin; 2093 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2094 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2095 2096 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2097 { 2098 static PetscScalar w[2000]; /* very BAD need to fix */ 2099 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2100 } 2101 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2102 { 2103 static PetscScalar w[2000]; /* very BAD need to fix */ 2104 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2105 } 2106 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2107 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2108 #else 2109 { 2110 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2111 MatScalar *v; 2112 int jdx,idt,idx,nz,*vi,i,ai16; 2113 2114 /* forward solve the lower triangular */ 2115 idx = 0; 2116 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2117 for (i=1; i<n; i++) { 2118 v = aa + 16*ai[i]; 2119 vi = aj + ai[i]; 2120 nz = diag[i] - ai[i]; 2121 idx += 4; 2122 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2123 while (nz--) { 2124 jdx = 4*(*vi++); 2125 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2126 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2127 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2128 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2129 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2130 v += 16; 2131 } 2132 x[idx] = s1; 2133 x[1+idx] = s2; 2134 x[2+idx] = s3; 2135 x[3+idx] = s4; 2136 } 2137 /* backward solve the upper triangular */ 2138 idt = 4*(n-1); 2139 for (i=n-1; i>=0; i--){ 2140 ai16 = 16*diag[i]; 2141 v = aa + ai16 + 16; 2142 vi = aj + diag[i] + 1; 2143 nz = ai[i+1] - diag[i] - 1; 2144 s1 = x[idt]; s2 = x[1+idt]; 2145 s3 = x[2+idt];s4 = x[3+idt]; 2146 while (nz--) { 2147 idx = 4*(*vi++); 2148 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2149 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2150 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2151 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2152 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2153 v += 16; 2154 } 2155 v = aa + ai16; 2156 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2157 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2158 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2159 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2160 idt -= 4; 2161 } 2162 } 2163 #endif 2164 2165 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2167 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2168 PetscFunctionReturn(0); 2169 } 2170 2171 #undef __FUNCT__ 2172 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2173 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2174 { 2175 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2176 int n=a->mbs,*ai=a->i,*aj=a->j; 2177 PetscErrorCode ierr; 2178 int *diag = a->diag; 2179 MatScalar *aa=a->a; 2180 PetscScalar *x,*b; 2181 2182 PetscFunctionBegin; 2183 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2184 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2185 2186 { 2187 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2188 MatScalar *v,*t=(MatScalar *)x; 2189 int jdx,idt,idx,nz,*vi,i,ai16; 2190 2191 /* forward solve the lower triangular */ 2192 idx = 0; 2193 t[0] = (MatScalar)b[0]; 2194 t[1] = (MatScalar)b[1]; 2195 t[2] = (MatScalar)b[2]; 2196 t[3] = (MatScalar)b[3]; 2197 for (i=1; i<n; i++) { 2198 v = aa + 16*ai[i]; 2199 vi = aj + ai[i]; 2200 nz = diag[i] - ai[i]; 2201 idx += 4; 2202 s1 = (MatScalar)b[idx]; 2203 s2 = (MatScalar)b[1+idx]; 2204 s3 = (MatScalar)b[2+idx]; 2205 s4 = (MatScalar)b[3+idx]; 2206 while (nz--) { 2207 jdx = 4*(*vi++); 2208 x1 = t[jdx]; 2209 x2 = t[1+jdx]; 2210 x3 = t[2+jdx]; 2211 x4 = t[3+jdx]; 2212 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2213 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2214 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2215 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2216 v += 16; 2217 } 2218 t[idx] = s1; 2219 t[1+idx] = s2; 2220 t[2+idx] = s3; 2221 t[3+idx] = s4; 2222 } 2223 /* backward solve the upper triangular */ 2224 idt = 4*(n-1); 2225 for (i=n-1; i>=0; i--){ 2226 ai16 = 16*diag[i]; 2227 v = aa + ai16 + 16; 2228 vi = aj + diag[i] + 1; 2229 nz = ai[i+1] - diag[i] - 1; 2230 s1 = t[idt]; 2231 s2 = t[1+idt]; 2232 s3 = t[2+idt]; 2233 s4 = t[3+idt]; 2234 while (nz--) { 2235 idx = 4*(*vi++); 2236 x1 = (MatScalar)x[idx]; 2237 x2 = (MatScalar)x[1+idx]; 2238 x3 = (MatScalar)x[2+idx]; 2239 x4 = (MatScalar)x[3+idx]; 2240 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2241 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2242 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2243 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2244 v += 16; 2245 } 2246 v = aa + ai16; 2247 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2248 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2249 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2250 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2251 idt -= 4; 2252 } 2253 } 2254 2255 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2256 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2257 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2258 PetscFunctionReturn(0); 2259 } 2260 2261 #if defined (PETSC_HAVE_SSE) 2262 2263 #include PETSC_HAVE_SSE 2264 #undef __FUNCT__ 2265 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2266 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2267 { 2268 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2269 unsigned short *aj=(unsigned short *)a->j; 2270 PetscErrorCode ierr; 2271 int *ai=a->i,n=a->mbs,*diag = a->diag; 2272 MatScalar *aa=a->a; 2273 PetscScalar *x,*b; 2274 2275 PetscFunctionBegin; 2276 SSE_SCOPE_BEGIN; 2277 /* 2278 Note: This code currently uses demotion of double 2279 to float when performing the mixed-mode computation. 2280 This may not be numerically reasonable for all applications. 2281 */ 2282 PREFETCH_NTA(aa+16*ai[1]); 2283 2284 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2285 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2286 { 2287 /* x will first be computed in single precision then promoted inplace to double */ 2288 MatScalar *v,*t=(MatScalar *)x; 2289 int nz,i,idt,ai16; 2290 unsigned int jdx,idx; 2291 unsigned short *vi; 2292 /* Forward solve the lower triangular factor. */ 2293 2294 /* First block is the identity. */ 2295 idx = 0; 2296 CONVERT_DOUBLE4_FLOAT4(t,b); 2297 v = aa + 16*((unsigned int)ai[1]); 2298 2299 for (i=1; i<n;) { 2300 PREFETCH_NTA(&v[8]); 2301 vi = aj + ai[i]; 2302 nz = diag[i] - ai[i]; 2303 idx += 4; 2304 2305 /* Demote RHS from double to float. */ 2306 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2307 LOAD_PS(&t[idx],XMM7); 2308 2309 while (nz--) { 2310 PREFETCH_NTA(&v[16]); 2311 jdx = 4*((unsigned int)(*vi++)); 2312 2313 /* 4x4 Matrix-Vector product with negative accumulation: */ 2314 SSE_INLINE_BEGIN_2(&t[jdx],v) 2315 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2316 2317 /* First Column */ 2318 SSE_COPY_PS(XMM0,XMM6) 2319 SSE_SHUFFLE(XMM0,XMM0,0x00) 2320 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2321 SSE_SUB_PS(XMM7,XMM0) 2322 2323 /* Second Column */ 2324 SSE_COPY_PS(XMM1,XMM6) 2325 SSE_SHUFFLE(XMM1,XMM1,0x55) 2326 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2327 SSE_SUB_PS(XMM7,XMM1) 2328 2329 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2330 2331 /* Third Column */ 2332 SSE_COPY_PS(XMM2,XMM6) 2333 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2334 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2335 SSE_SUB_PS(XMM7,XMM2) 2336 2337 /* Fourth Column */ 2338 SSE_COPY_PS(XMM3,XMM6) 2339 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2340 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2341 SSE_SUB_PS(XMM7,XMM3) 2342 SSE_INLINE_END_2 2343 2344 v += 16; 2345 } 2346 v = aa + 16*ai[++i]; 2347 PREFETCH_NTA(v); 2348 STORE_PS(&t[idx],XMM7); 2349 } 2350 2351 /* Backward solve the upper triangular factor.*/ 2352 2353 idt = 4*(n-1); 2354 ai16 = 16*diag[n-1]; 2355 v = aa + ai16 + 16; 2356 for (i=n-1; i>=0;){ 2357 PREFETCH_NTA(&v[8]); 2358 vi = aj + diag[i] + 1; 2359 nz = ai[i+1] - diag[i] - 1; 2360 2361 LOAD_PS(&t[idt],XMM7); 2362 2363 while (nz--) { 2364 PREFETCH_NTA(&v[16]); 2365 idx = 4*((unsigned int)(*vi++)); 2366 2367 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2368 SSE_INLINE_BEGIN_2(&t[idx],v) 2369 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2370 2371 /* First Column */ 2372 SSE_COPY_PS(XMM0,XMM6) 2373 SSE_SHUFFLE(XMM0,XMM0,0x00) 2374 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2375 SSE_SUB_PS(XMM7,XMM0) 2376 2377 /* Second Column */ 2378 SSE_COPY_PS(XMM1,XMM6) 2379 SSE_SHUFFLE(XMM1,XMM1,0x55) 2380 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2381 SSE_SUB_PS(XMM7,XMM1) 2382 2383 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2384 2385 /* Third Column */ 2386 SSE_COPY_PS(XMM2,XMM6) 2387 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2388 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2389 SSE_SUB_PS(XMM7,XMM2) 2390 2391 /* Fourth Column */ 2392 SSE_COPY_PS(XMM3,XMM6) 2393 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2394 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2395 SSE_SUB_PS(XMM7,XMM3) 2396 SSE_INLINE_END_2 2397 v += 16; 2398 } 2399 v = aa + ai16; 2400 ai16 = 16*diag[--i]; 2401 PREFETCH_NTA(aa+ai16+16); 2402 /* 2403 Scale the result by the diagonal 4x4 block, 2404 which was inverted as part of the factorization 2405 */ 2406 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2407 /* First Column */ 2408 SSE_COPY_PS(XMM0,XMM7) 2409 SSE_SHUFFLE(XMM0,XMM0,0x00) 2410 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2411 2412 /* Second Column */ 2413 SSE_COPY_PS(XMM1,XMM7) 2414 SSE_SHUFFLE(XMM1,XMM1,0x55) 2415 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2416 SSE_ADD_PS(XMM0,XMM1) 2417 2418 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2419 2420 /* Third Column */ 2421 SSE_COPY_PS(XMM2,XMM7) 2422 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2423 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2424 SSE_ADD_PS(XMM0,XMM2) 2425 2426 /* Fourth Column */ 2427 SSE_COPY_PS(XMM3,XMM7) 2428 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2429 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2430 SSE_ADD_PS(XMM0,XMM3) 2431 2432 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2433 SSE_INLINE_END_3 2434 2435 v = aa + ai16 + 16; 2436 idt -= 4; 2437 } 2438 2439 /* Convert t from single precision back to double precision (inplace)*/ 2440 idt = 4*(n-1); 2441 for (i=n-1;i>=0;i--) { 2442 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2443 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2444 PetscScalar *xtemp=&x[idt]; 2445 MatScalar *ttemp=&t[idt]; 2446 xtemp[3] = (PetscScalar)ttemp[3]; 2447 xtemp[2] = (PetscScalar)ttemp[2]; 2448 xtemp[1] = (PetscScalar)ttemp[1]; 2449 xtemp[0] = (PetscScalar)ttemp[0]; 2450 idt -= 4; 2451 } 2452 2453 } /* End of artificial scope. */ 2454 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2455 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2456 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2457 SSE_SCOPE_END; 2458 PetscFunctionReturn(0); 2459 } 2460 2461 #undef __FUNCT__ 2462 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2463 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2464 { 2465 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2466 int *aj=a->j; 2467 PetscErrorCode ierr; 2468 int *ai=a->i,n=a->mbs,*diag = a->diag; 2469 MatScalar *aa=a->a; 2470 PetscScalar *x,*b; 2471 2472 PetscFunctionBegin; 2473 SSE_SCOPE_BEGIN; 2474 /* 2475 Note: This code currently uses demotion of double 2476 to float when performing the mixed-mode computation. 2477 This may not be numerically reasonable for all applications. 2478 */ 2479 PREFETCH_NTA(aa+16*ai[1]); 2480 2481 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2482 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2483 { 2484 /* x will first be computed in single precision then promoted inplace to double */ 2485 MatScalar *v,*t=(MatScalar *)x; 2486 int nz,i,idt,ai16; 2487 int jdx,idx; 2488 int *vi; 2489 /* Forward solve the lower triangular factor. */ 2490 2491 /* First block is the identity. */ 2492 idx = 0; 2493 CONVERT_DOUBLE4_FLOAT4(t,b); 2494 v = aa + 16*ai[1]; 2495 2496 for (i=1; i<n;) { 2497 PREFETCH_NTA(&v[8]); 2498 vi = aj + ai[i]; 2499 nz = diag[i] - ai[i]; 2500 idx += 4; 2501 2502 /* Demote RHS from double to float. */ 2503 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2504 LOAD_PS(&t[idx],XMM7); 2505 2506 while (nz--) { 2507 PREFETCH_NTA(&v[16]); 2508 jdx = 4*(*vi++); 2509 /* jdx = *vi++; */ 2510 2511 /* 4x4 Matrix-Vector product with negative accumulation: */ 2512 SSE_INLINE_BEGIN_2(&t[jdx],v) 2513 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2514 2515 /* First Column */ 2516 SSE_COPY_PS(XMM0,XMM6) 2517 SSE_SHUFFLE(XMM0,XMM0,0x00) 2518 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2519 SSE_SUB_PS(XMM7,XMM0) 2520 2521 /* Second Column */ 2522 SSE_COPY_PS(XMM1,XMM6) 2523 SSE_SHUFFLE(XMM1,XMM1,0x55) 2524 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2525 SSE_SUB_PS(XMM7,XMM1) 2526 2527 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2528 2529 /* Third Column */ 2530 SSE_COPY_PS(XMM2,XMM6) 2531 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2532 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2533 SSE_SUB_PS(XMM7,XMM2) 2534 2535 /* Fourth Column */ 2536 SSE_COPY_PS(XMM3,XMM6) 2537 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2538 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2539 SSE_SUB_PS(XMM7,XMM3) 2540 SSE_INLINE_END_2 2541 2542 v += 16; 2543 } 2544 v = aa + 16*ai[++i]; 2545 PREFETCH_NTA(v); 2546 STORE_PS(&t[idx],XMM7); 2547 } 2548 2549 /* Backward solve the upper triangular factor.*/ 2550 2551 idt = 4*(n-1); 2552 ai16 = 16*diag[n-1]; 2553 v = aa + ai16 + 16; 2554 for (i=n-1; i>=0;){ 2555 PREFETCH_NTA(&v[8]); 2556 vi = aj + diag[i] + 1; 2557 nz = ai[i+1] - diag[i] - 1; 2558 2559 LOAD_PS(&t[idt],XMM7); 2560 2561 while (nz--) { 2562 PREFETCH_NTA(&v[16]); 2563 idx = 4*(*vi++); 2564 /* idx = *vi++; */ 2565 2566 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2567 SSE_INLINE_BEGIN_2(&t[idx],v) 2568 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2569 2570 /* First Column */ 2571 SSE_COPY_PS(XMM0,XMM6) 2572 SSE_SHUFFLE(XMM0,XMM0,0x00) 2573 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2574 SSE_SUB_PS(XMM7,XMM0) 2575 2576 /* Second Column */ 2577 SSE_COPY_PS(XMM1,XMM6) 2578 SSE_SHUFFLE(XMM1,XMM1,0x55) 2579 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2580 SSE_SUB_PS(XMM7,XMM1) 2581 2582 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2583 2584 /* Third Column */ 2585 SSE_COPY_PS(XMM2,XMM6) 2586 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2587 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2588 SSE_SUB_PS(XMM7,XMM2) 2589 2590 /* Fourth Column */ 2591 SSE_COPY_PS(XMM3,XMM6) 2592 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2593 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2594 SSE_SUB_PS(XMM7,XMM3) 2595 SSE_INLINE_END_2 2596 v += 16; 2597 } 2598 v = aa + ai16; 2599 ai16 = 16*diag[--i]; 2600 PREFETCH_NTA(aa+ai16+16); 2601 /* 2602 Scale the result by the diagonal 4x4 block, 2603 which was inverted as part of the factorization 2604 */ 2605 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2606 /* First Column */ 2607 SSE_COPY_PS(XMM0,XMM7) 2608 SSE_SHUFFLE(XMM0,XMM0,0x00) 2609 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2610 2611 /* Second Column */ 2612 SSE_COPY_PS(XMM1,XMM7) 2613 SSE_SHUFFLE(XMM1,XMM1,0x55) 2614 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2615 SSE_ADD_PS(XMM0,XMM1) 2616 2617 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2618 2619 /* Third Column */ 2620 SSE_COPY_PS(XMM2,XMM7) 2621 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2622 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2623 SSE_ADD_PS(XMM0,XMM2) 2624 2625 /* Fourth Column */ 2626 SSE_COPY_PS(XMM3,XMM7) 2627 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2628 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2629 SSE_ADD_PS(XMM0,XMM3) 2630 2631 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2632 SSE_INLINE_END_3 2633 2634 v = aa + ai16 + 16; 2635 idt -= 4; 2636 } 2637 2638 /* Convert t from single precision back to double precision (inplace)*/ 2639 idt = 4*(n-1); 2640 for (i=n-1;i>=0;i--) { 2641 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2642 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2643 PetscScalar *xtemp=&x[idt]; 2644 MatScalar *ttemp=&t[idt]; 2645 xtemp[3] = (PetscScalar)ttemp[3]; 2646 xtemp[2] = (PetscScalar)ttemp[2]; 2647 xtemp[1] = (PetscScalar)ttemp[1]; 2648 xtemp[0] = (PetscScalar)ttemp[0]; 2649 idt -= 4; 2650 } 2651 2652 } /* End of artificial scope. */ 2653 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2654 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2655 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2656 SSE_SCOPE_END; 2657 PetscFunctionReturn(0); 2658 } 2659 2660 #endif 2661 #undef __FUNCT__ 2662 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2663 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2664 { 2665 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2666 IS iscol=a->col,isrow=a->row; 2667 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2668 int *diag = a->diag; 2669 MatScalar *aa=a->a,*v; 2670 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2671 2672 PetscFunctionBegin; 2673 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2674 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2675 t = a->solve_work; 2676 2677 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2678 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2679 2680 /* forward solve the lower triangular */ 2681 idx = 3*(*r++); 2682 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2683 for (i=1; i<n; i++) { 2684 v = aa + 9*ai[i]; 2685 vi = aj + ai[i]; 2686 nz = diag[i] - ai[i]; 2687 idx = 3*(*r++); 2688 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2689 while (nz--) { 2690 idx = 3*(*vi++); 2691 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2692 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2693 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2694 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2695 v += 9; 2696 } 2697 idx = 3*i; 2698 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2699 } 2700 /* backward solve the upper triangular */ 2701 for (i=n-1; i>=0; i--){ 2702 v = aa + 9*diag[i] + 9; 2703 vi = aj + diag[i] + 1; 2704 nz = ai[i+1] - diag[i] - 1; 2705 idt = 3*i; 2706 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2707 while (nz--) { 2708 idx = 3*(*vi++); 2709 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2710 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2711 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2712 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2713 v += 9; 2714 } 2715 idc = 3*(*c--); 2716 v = aa + 9*diag[i]; 2717 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2718 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2719 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2720 } 2721 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2722 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2723 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2724 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2725 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2726 PetscFunctionReturn(0); 2727 } 2728 2729 /* 2730 Special case where the matrix was ILU(0) factored in the natural 2731 ordering. This eliminates the need for the column and row permutation. 2732 */ 2733 #undef __FUNCT__ 2734 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2735 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2736 { 2737 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2738 int n=a->mbs,*ai=a->i,*aj=a->j; 2739 PetscErrorCode ierr; 2740 int *diag = a->diag; 2741 MatScalar *aa=a->a,*v; 2742 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2743 int jdx,idt,idx,nz,*vi,i; 2744 2745 PetscFunctionBegin; 2746 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2747 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2748 2749 2750 /* forward solve the lower triangular */ 2751 idx = 0; 2752 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2753 for (i=1; i<n; i++) { 2754 v = aa + 9*ai[i]; 2755 vi = aj + ai[i]; 2756 nz = diag[i] - ai[i]; 2757 idx += 3; 2758 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2759 while (nz--) { 2760 jdx = 3*(*vi++); 2761 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2762 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2763 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2764 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2765 v += 9; 2766 } 2767 x[idx] = s1; 2768 x[1+idx] = s2; 2769 x[2+idx] = s3; 2770 } 2771 /* backward solve the upper triangular */ 2772 for (i=n-1; i>=0; i--){ 2773 v = aa + 9*diag[i] + 9; 2774 vi = aj + diag[i] + 1; 2775 nz = ai[i+1] - diag[i] - 1; 2776 idt = 3*i; 2777 s1 = x[idt]; s2 = x[1+idt]; 2778 s3 = x[2+idt]; 2779 while (nz--) { 2780 idx = 3*(*vi++); 2781 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2782 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2783 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2784 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2785 v += 9; 2786 } 2787 v = aa + 9*diag[i]; 2788 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2789 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2790 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2791 } 2792 2793 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2794 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2795 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2796 PetscFunctionReturn(0); 2797 } 2798 2799 #undef __FUNCT__ 2800 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2801 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2802 { 2803 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2804 IS iscol=a->col,isrow=a->row; 2805 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2806 int *diag = a->diag; 2807 MatScalar *aa=a->a,*v; 2808 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2809 2810 PetscFunctionBegin; 2811 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2812 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2813 t = a->solve_work; 2814 2815 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2816 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2817 2818 /* forward solve the lower triangular */ 2819 idx = 2*(*r++); 2820 t[0] = b[idx]; t[1] = b[1+idx]; 2821 for (i=1; i<n; i++) { 2822 v = aa + 4*ai[i]; 2823 vi = aj + ai[i]; 2824 nz = diag[i] - ai[i]; 2825 idx = 2*(*r++); 2826 s1 = b[idx]; s2 = b[1+idx]; 2827 while (nz--) { 2828 idx = 2*(*vi++); 2829 x1 = t[idx]; x2 = t[1+idx]; 2830 s1 -= v[0]*x1 + v[2]*x2; 2831 s2 -= v[1]*x1 + v[3]*x2; 2832 v += 4; 2833 } 2834 idx = 2*i; 2835 t[idx] = s1; t[1+idx] = s2; 2836 } 2837 /* backward solve the upper triangular */ 2838 for (i=n-1; i>=0; i--){ 2839 v = aa + 4*diag[i] + 4; 2840 vi = aj + diag[i] + 1; 2841 nz = ai[i+1] - diag[i] - 1; 2842 idt = 2*i; 2843 s1 = t[idt]; s2 = t[1+idt]; 2844 while (nz--) { 2845 idx = 2*(*vi++); 2846 x1 = t[idx]; x2 = t[1+idx]; 2847 s1 -= v[0]*x1 + v[2]*x2; 2848 s2 -= v[1]*x1 + v[3]*x2; 2849 v += 4; 2850 } 2851 idc = 2*(*c--); 2852 v = aa + 4*diag[i]; 2853 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2854 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2855 } 2856 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2857 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2858 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2859 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2860 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2861 PetscFunctionReturn(0); 2862 } 2863 2864 /* 2865 Special case where the matrix was ILU(0) factored in the natural 2866 ordering. This eliminates the need for the column and row permutation. 2867 */ 2868 #undef __FUNCT__ 2869 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2870 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2871 { 2872 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2873 int n=a->mbs,*ai=a->i,*aj=a->j; 2874 PetscErrorCode ierr; 2875 int *diag = a->diag; 2876 MatScalar *aa=a->a,*v; 2877 PetscScalar *x,*b,s1,s2,x1,x2; 2878 int jdx,idt,idx,nz,*vi,i; 2879 2880 PetscFunctionBegin; 2881 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2882 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2883 2884 /* forward solve the lower triangular */ 2885 idx = 0; 2886 x[0] = b[0]; x[1] = b[1]; 2887 for (i=1; i<n; i++) { 2888 v = aa + 4*ai[i]; 2889 vi = aj + ai[i]; 2890 nz = diag[i] - ai[i]; 2891 idx += 2; 2892 s1 = b[idx];s2 = b[1+idx]; 2893 while (nz--) { 2894 jdx = 2*(*vi++); 2895 x1 = x[jdx];x2 = x[1+jdx]; 2896 s1 -= v[0]*x1 + v[2]*x2; 2897 s2 -= v[1]*x1 + v[3]*x2; 2898 v += 4; 2899 } 2900 x[idx] = s1; 2901 x[1+idx] = s2; 2902 } 2903 /* backward solve the upper triangular */ 2904 for (i=n-1; i>=0; i--){ 2905 v = aa + 4*diag[i] + 4; 2906 vi = aj + diag[i] + 1; 2907 nz = ai[i+1] - diag[i] - 1; 2908 idt = 2*i; 2909 s1 = x[idt]; s2 = x[1+idt]; 2910 while (nz--) { 2911 idx = 2*(*vi++); 2912 x1 = x[idx]; x2 = x[1+idx]; 2913 s1 -= v[0]*x1 + v[2]*x2; 2914 s2 -= v[1]*x1 + v[3]*x2; 2915 v += 4; 2916 } 2917 v = aa + 4*diag[i]; 2918 x[idt] = v[0]*s1 + v[2]*s2; 2919 x[1+idt] = v[1]*s1 + v[3]*s2; 2920 } 2921 2922 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2923 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2924 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2925 PetscFunctionReturn(0); 2926 } 2927 2928 #undef __FUNCT__ 2929 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2930 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2931 { 2932 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2933 IS iscol=a->col,isrow=a->row; 2934 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2935 int *diag = a->diag; 2936 MatScalar *aa=a->a,*v; 2937 PetscScalar *x,*b,s1,*t; 2938 2939 PetscFunctionBegin; 2940 if (!n) PetscFunctionReturn(0); 2941 2942 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2943 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2944 t = a->solve_work; 2945 2946 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2947 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2948 2949 /* forward solve the lower triangular */ 2950 t[0] = b[*r++]; 2951 for (i=1; i<n; i++) { 2952 v = aa + ai[i]; 2953 vi = aj + ai[i]; 2954 nz = diag[i] - ai[i]; 2955 s1 = b[*r++]; 2956 while (nz--) { 2957 s1 -= (*v++)*t[*vi++]; 2958 } 2959 t[i] = s1; 2960 } 2961 /* backward solve the upper triangular */ 2962 for (i=n-1; i>=0; i--){ 2963 v = aa + diag[i] + 1; 2964 vi = aj + diag[i] + 1; 2965 nz = ai[i+1] - diag[i] - 1; 2966 s1 = t[i]; 2967 while (nz--) { 2968 s1 -= (*v++)*t[*vi++]; 2969 } 2970 x[*c--] = t[i] = aa[diag[i]]*s1; 2971 } 2972 2973 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2974 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2975 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2976 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2977 PetscLogFlops(2*1*(a->nz) - A->n); 2978 PetscFunctionReturn(0); 2979 } 2980 /* 2981 Special case where the matrix was ILU(0) factored in the natural 2982 ordering. This eliminates the need for the column and row permutation. 2983 */ 2984 #undef __FUNCT__ 2985 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2986 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2987 { 2988 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2989 int n=a->mbs,*ai=a->i,*aj=a->j; 2990 PetscErrorCode ierr; 2991 int *diag = a->diag; 2992 MatScalar *aa=a->a; 2993 PetscScalar *x,*b; 2994 PetscScalar s1,x1; 2995 MatScalar *v; 2996 int jdx,idt,idx,nz,*vi,i; 2997 2998 PetscFunctionBegin; 2999 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3000 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3001 3002 /* forward solve the lower triangular */ 3003 idx = 0; 3004 x[0] = b[0]; 3005 for (i=1; i<n; i++) { 3006 v = aa + ai[i]; 3007 vi = aj + ai[i]; 3008 nz = diag[i] - ai[i]; 3009 idx += 1; 3010 s1 = b[idx]; 3011 while (nz--) { 3012 jdx = *vi++; 3013 x1 = x[jdx]; 3014 s1 -= v[0]*x1; 3015 v += 1; 3016 } 3017 x[idx] = s1; 3018 } 3019 /* backward solve the upper triangular */ 3020 for (i=n-1; i>=0; i--){ 3021 v = aa + diag[i] + 1; 3022 vi = aj + diag[i] + 1; 3023 nz = ai[i+1] - diag[i] - 1; 3024 idt = i; 3025 s1 = x[idt]; 3026 while (nz--) { 3027 idx = *vi++; 3028 x1 = x[idx]; 3029 s1 -= v[0]*x1; 3030 v += 1; 3031 } 3032 v = aa + diag[i]; 3033 x[idt] = v[0]*s1; 3034 } 3035 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3037 PetscLogFlops(2*(a->nz) - A->n); 3038 PetscFunctionReturn(0); 3039 } 3040 3041 /* ----------------------------------------------------------------*/ 3042 /* 3043 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3044 except that the data structure of Mat_SeqAIJ is slightly different. 3045 Not a good example of code reuse. 3046 */ 3047 EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat); 3048 3049 #undef __FUNCT__ 3050 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3051 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3052 { 3053 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3054 IS isicol; 3055 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3056 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3057 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3058 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 3059 PetscTruth col_identity,row_identity; 3060 PetscReal f; 3061 3062 PetscFunctionBegin; 3063 f = info->fill; 3064 levels = (int)info->levels; 3065 diagonal_fill = (int)info->diagonal_fill; 3066 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3067 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3068 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3069 3070 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3071 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3072 (*fact)->factor = FACTOR_LU; 3073 b = (Mat_SeqBAIJ*)(*fact)->data; 3074 if (!b->diag) { 3075 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3076 } 3077 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3078 b->row = isrow; 3079 b->col = iscol; 3080 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3081 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3082 b->icol = isicol; 3083 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3084 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3085 } else { /* general case perform the symbolic factorization */ 3086 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3087 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3088 3089 /* get new row pointers */ 3090 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 3091 ainew[0] = 0; 3092 /* don't know how many column pointers are needed so estimate */ 3093 jmax = (int)(f*ai[n] + 1); 3094 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 3095 /* ajfill is level of fill for each fill entry */ 3096 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 3097 /* fill is a linked list of nonzeros in active row */ 3098 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 3099 /* im is level for each filled value */ 3100 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 3101 /* dloc is location of diagonal in factor */ 3102 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 3103 dloc[0] = 0; 3104 for (prow=0; prow<n; prow++) { 3105 3106 /* copy prow into linked list */ 3107 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3108 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3109 xi = aj + ai[r[prow]]; 3110 fill[n] = n; 3111 fill[prow] = -1; /* marker for diagonal entry */ 3112 while (nz--) { 3113 fm = n; 3114 idx = ic[*xi++]; 3115 do { 3116 m = fm; 3117 fm = fill[m]; 3118 } while (fm < idx); 3119 fill[m] = idx; 3120 fill[idx] = fm; 3121 im[idx] = 0; 3122 } 3123 3124 /* make sure diagonal entry is included */ 3125 if (diagonal_fill && fill[prow] == -1) { 3126 fm = n; 3127 while (fill[fm] < prow) fm = fill[fm]; 3128 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3129 fill[fm] = prow; 3130 im[prow] = 0; 3131 nzf++; 3132 dcount++; 3133 } 3134 3135 nzi = 0; 3136 row = fill[n]; 3137 while (row < prow) { 3138 incrlev = im[row] + 1; 3139 nz = dloc[row]; 3140 xi = ajnew + ainew[row] + nz + 1; 3141 flev = ajfill + ainew[row] + nz + 1; 3142 nnz = ainew[row+1] - ainew[row] - nz - 1; 3143 fm = row; 3144 while (nnz-- > 0) { 3145 idx = *xi++; 3146 if (*flev + incrlev > levels) { 3147 flev++; 3148 continue; 3149 } 3150 do { 3151 m = fm; 3152 fm = fill[m]; 3153 } while (fm < idx); 3154 if (fm != idx) { 3155 im[idx] = *flev + incrlev; 3156 fill[m] = idx; 3157 fill[idx] = fm; 3158 fm = idx; 3159 nzf++; 3160 } else { 3161 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3162 } 3163 flev++; 3164 } 3165 row = fill[row]; 3166 nzi++; 3167 } 3168 /* copy new filled row into permanent storage */ 3169 ainew[prow+1] = ainew[prow] + nzf; 3170 if (ainew[prow+1] > jmax) { 3171 3172 /* estimate how much additional space we will need */ 3173 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3174 /* just double the memory each time */ 3175 int maxadd = jmax; 3176 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3177 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3178 jmax += maxadd; 3179 3180 /* allocate a longer ajnew and ajfill */ 3181 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3182 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3183 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3184 ajnew = xi; 3185 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3186 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3187 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3188 ajfill = xi; 3189 reallocate++; /* count how many reallocations are needed */ 3190 } 3191 xi = ajnew + ainew[prow]; 3192 flev = ajfill + ainew[prow]; 3193 dloc[prow] = nzi; 3194 fm = fill[n]; 3195 while (nzf--) { 3196 *xi++ = fm; 3197 *flev++ = im[fm]; 3198 fm = fill[fm]; 3199 } 3200 /* make sure row has diagonal entry */ 3201 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3202 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3203 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3204 } 3205 } 3206 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3207 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3208 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3209 ierr = PetscFree(fill);CHKERRQ(ierr); 3210 ierr = PetscFree(im);CHKERRQ(ierr); 3211 3212 { 3213 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3214 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3215 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3216 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3217 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3218 if (diagonal_fill) { 3219 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3220 } 3221 } 3222 3223 /* put together the new matrix */ 3224 ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr); 3225 ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3226 ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr); 3227 PetscLogObjectParent(*fact,isicol); 3228 b = (Mat_SeqBAIJ*)(*fact)->data; 3229 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3230 b->singlemalloc = PETSC_FALSE; 3231 len = bs2*ainew[n]*sizeof(MatScalar); 3232 /* the next line frees the default space generated by the Create() */ 3233 ierr = PetscFree(b->a);CHKERRQ(ierr); 3234 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3235 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3236 b->j = ajnew; 3237 b->i = ainew; 3238 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3239 b->diag = dloc; 3240 b->ilen = 0; 3241 b->imax = 0; 3242 b->row = isrow; 3243 b->col = iscol; 3244 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3245 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3246 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3247 b->icol = isicol; 3248 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3249 /* In b structure: Free imax, ilen, old a, old j. 3250 Allocate dloc, solve_work, new a, new j */ 3251 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3252 b->maxnz = b->nz = ainew[n]; 3253 (*fact)->factor = FACTOR_LU; 3254 3255 (*fact)->info.factor_mallocs = reallocate; 3256 (*fact)->info.fill_ratio_given = f; 3257 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3258 } 3259 3260 if (row_identity && col_identity) { 3261 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3262 } 3263 PetscFunctionReturn(0); 3264 } 3265 3266 #undef __FUNCT__ 3267 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3268 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3269 { 3270 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3271 /* int i,*AJ=a->j,nz=a->nz; */ 3272 PetscFunctionBegin; 3273 /* Undo Column scaling */ 3274 /* while (nz--) { */ 3275 /* AJ[i] = AJ[i]/4; */ 3276 /* } */ 3277 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3278 A->ops->setunfactored = PETSC_NULL; 3279 PetscFunctionReturn(0); 3280 } 3281 3282 #undef __FUNCT__ 3283 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3284 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3285 { 3286 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3287 int *AJ=a->j,nz=a->nz; 3288 unsigned short *aj=(unsigned short *)AJ; 3289 PetscFunctionBegin; 3290 /* Is this really necessary? */ 3291 while (nz--) { 3292 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3293 } 3294 A->ops->setunfactored = PETSC_NULL; 3295 PetscFunctionReturn(0); 3296 } 3297 3298 #undef __FUNCT__ 3299 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3300 PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3301 { 3302 /* 3303 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3304 with natural ordering 3305 */ 3306 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3307 3308 PetscFunctionBegin; 3309 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3310 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3311 switch (a->bs) { 3312 case 1: 3313 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3314 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3315 break; 3316 case 2: 3317 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3318 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3319 break; 3320 case 3: 3321 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3322 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3323 break; 3324 case 4: 3325 #if defined(PETSC_USE_MAT_SINGLE) 3326 { 3327 PetscTruth sse_enabled_local; 3328 PetscErrorCode ierr; 3329 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3330 if (sse_enabled_local) { 3331 # if defined(PETSC_HAVE_SSE) 3332 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3333 if (n==(unsigned short)n) { 3334 unsigned short *aj=(unsigned short *)AJ; 3335 for (i=0;i<nz;i++) { 3336 aj[i] = (unsigned short)AJ[i]; 3337 } 3338 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3339 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3340 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 3341 } else { 3342 /* Scale the column indices for easier indexing in MatSolve. */ 3343 /* for (i=0;i<nz;i++) { */ 3344 /* AJ[i] = AJ[i]*4; */ 3345 /* } */ 3346 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3347 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3348 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 3349 } 3350 # else 3351 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3352 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3353 # endif 3354 } else { 3355 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3356 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3357 } 3358 } 3359 #else 3360 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3361 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3362 #endif 3363 break; 3364 case 5: 3365 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3366 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3367 break; 3368 case 6: 3369 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3370 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3371 break; 3372 case 7: 3373 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3374 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3375 break; 3376 } 3377 PetscFunctionReturn(0); 3378 } 3379 3380 #undef __FUNCT__ 3381 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3382 PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A) 3383 { 3384 /* 3385 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3386 with natural ordering 3387 */ 3388 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3389 IS row = a->row, col = a->col; 3390 PetscTruth row_identity, col_identity; 3391 PetscTruth use_natural; 3392 PetscErrorCode ierr; 3393 3394 PetscFunctionBegin; 3395 3396 use_natural = PETSC_FALSE; 3397 if (row && col) { 3398 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3399 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3400 3401 if (row_identity && col_identity) { 3402 use_natural = PETSC_TRUE; 3403 } 3404 } else { 3405 use_natural = PETSC_TRUE; 3406 } 3407 3408 switch (a->bs) { 3409 case 1: 3410 if (use_natural) { 3411 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3412 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3413 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3414 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3415 } else { 3416 A->ops->solve = MatSolve_SeqBAIJ_1; 3417 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3418 } 3419 break; 3420 case 2: 3421 if (use_natural) { 3422 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3423 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3424 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3425 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3426 } else { 3427 A->ops->solve = MatSolve_SeqBAIJ_2; 3428 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3429 } 3430 break; 3431 case 3: 3432 if (use_natural) { 3433 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3434 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3435 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3436 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3437 } else { 3438 A->ops->solve = MatSolve_SeqBAIJ_3; 3439 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3440 } 3441 break; 3442 case 4: 3443 { 3444 PetscTruth sse_enabled_local; 3445 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3446 if (use_natural) { 3447 #if defined(PETSC_USE_MAT_SINGLE) 3448 if (sse_enabled_local) { /* Natural + Single + SSE */ 3449 # if defined(PETSC_HAVE_SSE) 3450 int n=a->mbs; 3451 if (n==(unsigned short)n) { 3452 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3453 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3454 } else { 3455 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3456 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3457 } 3458 # else 3459 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3460 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3461 # endif 3462 } else { /* Natural + Single */ 3463 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3464 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3465 } 3466 #else 3467 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3468 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3469 #endif 3470 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3471 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3472 } else { /* Arbitrary ordering */ 3473 #if defined(PETSC_USE_MAT_SINGLE) 3474 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3475 # if defined(PETSC_HAVE_SSE) 3476 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3477 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3478 # else 3479 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3480 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3481 # endif 3482 } else { /* Arbitrary + Single */ 3483 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3484 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3485 } 3486 #else 3487 A->ops->solve = MatSolve_SeqBAIJ_4; 3488 #endif 3489 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3490 } 3491 } 3492 break; 3493 case 5: 3494 if (use_natural) { 3495 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3496 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3497 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3498 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3499 } else { 3500 A->ops->solve = MatSolve_SeqBAIJ_5; 3501 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3502 } 3503 break; 3504 case 6: 3505 if (use_natural) { 3506 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3507 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3508 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3509 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3510 } else { 3511 A->ops->solve = MatSolve_SeqBAIJ_6; 3512 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3513 } 3514 break; 3515 case 7: 3516 if (use_natural) { 3517 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3518 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3519 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3520 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3521 } else { 3522 A->ops->solve = MatSolve_SeqBAIJ_7; 3523 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3524 } 3525 break; 3526 default: 3527 A->ops->solve = MatSolve_SeqBAIJ_N; 3528 break; 3529 } 3530 PetscFunctionReturn(0); 3531 } 3532 3533 #undef __FUNCT__ 3534 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3535 PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3536 PetscErrorCode ierr; 3537 3538 PetscFunctionBegin; 3539 ierr = MatSeqBAIJ_UpdateSolvers(A); 3540 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3541 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3542 } else { 3543 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3544 } 3545 PetscFunctionReturn(0); 3546 } 3547 3548 #undef __FUNCT__ 3549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3550 PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3551 PetscErrorCode ierr; 3552 3553 PetscFunctionBegin; 3554 ierr = MatSeqBAIJ_UpdateSolvers(A); 3555 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3556 PetscFunctionReturn(0); 3557 } 3558 3559 3560