1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124 PetscInt nz,idx,idt,j,i,oidx; 125 PetscInt bs=A->rmap->bs,bs2=a->bs2; 126 MatScalar *aa=a->a,*v; 127 PetscScalar s1,s2,x1,x2; 128 PetscScalar *x,*b; 129 130 PetscFunctionBegin; 131 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134 135 /* forward solve the U^T */ 136 idx = 0; 137 for (i=0; i<n; i++) { 138 v = aa + bs2*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; 141 s1 = v[0]*x1 + v[1]*x2; 142 s2 = v[2]*x1 + v[3]*x2; 143 v -= bs2; 144 145 vi = aj + diag[i] - 1; 146 nz = diag[i] - diag[i+1] - 1; 147 for(j=0;j>-nz;j--){ 148 oidx = bs*vi[j]; 149 x[oidx] -= v[0]*s1 + v[1]*s2; 150 x[oidx+1] -= v[2]*s1 + v[3]*s2; 151 v -= bs2; 152 } 153 x[idx] = s1;x[1+idx] = s2; 154 idx += bs; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + bs2*ai[i]; 159 vi = aj + ai[i]; 160 nz = ai[i+1] - ai[i]; 161 idt = bs*i; 162 s1 = x[idt]; s2 = x[1+idt]; 163 for(j=0;j<nz;j++){ 164 idx = bs*vi[j]; 165 x[idx] -= v[0]*s1 + v[1]*s2; 166 x[idx+1] -= v[2]*s1 + v[3]*s2; 167 v += bs2; 168 } 169 } 170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183 PetscInt *diag = a->diag,oidx; 184 MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,x1,x2,x3; 186 PetscScalar *x,*b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 9*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203 v += 9; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 3*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212 v += 9; 213 } 214 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215 idx += 3; 216 } 217 /* backward solve the L^T */ 218 for (i=n-1; i>=0; i--){ 219 v = aa + 9*diag[i] - 9; 220 vi = aj + diag[i] - 1; 221 nz = diag[i] - ai[i]; 222 idt = 3*i; 223 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224 while (nz--) { 225 idx = 3*(*vi--); 226 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229 v -= 9; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 PetscErrorCode ierr; 244 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 245 PetscInt nz,idx,idt,j,i,oidx; 246 PetscInt bs=A->rmap->bs,bs2=a->bs2; 247 MatScalar *aa=a->a,*v; 248 PetscScalar s1,s2,s3,x1,x2,x3; 249 PetscScalar *x,*b; 250 251 PetscFunctionBegin; 252 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255 256 /* forward solve the U^T */ 257 idx = 0; 258 for (i=0; i<n; i++) { 259 v = aa + bs2*diag[i]; 260 /* multiply by the inverse of the block diagonal */ 261 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 262 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 263 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 264 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 265 v -= bs2; 266 267 vi = aj + diag[i] - 1; 268 nz = diag[i] - diag[i+1] - 1; 269 for(j=0;j>-nz;j--){ 270 oidx = bs*vi[j]; 271 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 272 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 273 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 274 v -= bs2; 275 } 276 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 277 idx += bs; 278 } 279 /* backward solve the L^T */ 280 for (i=n-1; i>=0; i--){ 281 v = aa + bs2*ai[i]; 282 vi = aj + ai[i]; 283 nz = ai[i+1] - ai[i]; 284 idt = bs*i; 285 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 286 for(j=0;j<nz;j++){ 287 idx = bs*vi[j]; 288 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 289 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 290 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 291 v += bs2; 292 } 293 } 294 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 PetscErrorCode ierr; 306 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 PetscInt *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 310 PetscScalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316 317 /* forward solve the U^T */ 318 idx = 0; 319 for (i=0; i<n; i++) { 320 321 v = aa + 16*diag[i]; 322 /* multiply by the inverse of the block diagonal */ 323 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328 v += 16; 329 330 vi = aj + diag[i] + 1; 331 nz = ai[i+1] - diag[i] - 1; 332 while (nz--) { 333 oidx = 4*(*vi++); 334 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338 v += 16; 339 } 340 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341 idx += 4; 342 } 343 /* backward solve the L^T */ 344 for (i=n-1; i>=0; i--){ 345 v = aa + 16*diag[i] - 16; 346 vi = aj + diag[i] - 1; 347 nz = diag[i] - ai[i]; 348 idt = 4*i; 349 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350 while (nz--) { 351 idx = 4*(*vi--); 352 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356 v -= 16; 357 } 358 } 359 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 370 PetscErrorCode ierr; 371 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 372 PetscInt nz,idx,idt,j,i,oidx; 373 PetscInt bs=A->rmap->bs,bs2=a->bs2; 374 MatScalar *aa=a->a,*v; 375 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 376 PetscScalar *x,*b; 377 378 PetscFunctionBegin; 379 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 383 /* forward solve the U^T */ 384 idx = 0; 385 for (i=0; i<n; i++) { 386 v = aa + bs2*diag[i]; 387 /* multiply by the inverse of the block diagonal */ 388 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 389 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393 v -= bs2; 394 395 vi = aj + diag[i] - 1; 396 nz = diag[i] - diag[i+1] - 1; 397 for(j=0;j>-nz;j--){ 398 oidx = bs*vi[j]; 399 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 400 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 401 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 402 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 403 v -= bs2; 404 } 405 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 406 idx += bs; 407 } 408 /* backward solve the L^T */ 409 for (i=n-1; i>=0; i--){ 410 v = aa + bs2*ai[i]; 411 vi = aj + ai[i]; 412 nz = ai[i+1] - ai[i]; 413 idt = bs*i; 414 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 415 for(j=0;j<nz;j++){ 416 idx = bs*vi[j]; 417 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 418 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 419 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 420 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 421 v += bs2; 422 } 423 } 424 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 425 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 426 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 #undef __FUNCT__ 431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 433 { 434 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435 PetscErrorCode ierr; 436 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437 PetscInt *diag = a->diag,oidx; 438 MatScalar *aa=a->a,*v; 439 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 440 PetscScalar *x,*b; 441 442 PetscFunctionBegin; 443 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 444 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446 447 /* forward solve the U^T */ 448 idx = 0; 449 for (i=0; i<n; i++) { 450 451 v = aa + 25*diag[i]; 452 /* multiply by the inverse of the block diagonal */ 453 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459 v += 25; 460 461 vi = aj + diag[i] + 1; 462 nz = ai[i+1] - diag[i] - 1; 463 while (nz--) { 464 oidx = 5*(*vi++); 465 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470 v += 25; 471 } 472 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473 idx += 5; 474 } 475 /* backward solve the L^T */ 476 for (i=n-1; i>=0; i--){ 477 v = aa + 25*diag[i] - 25; 478 vi = aj + diag[i] - 1; 479 nz = diag[i] - ai[i]; 480 idt = 5*i; 481 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482 while (nz--) { 483 idx = 5*(*vi--); 484 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489 v -= 25; 490 } 491 } 492 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 493 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #undef __FUNCT__ 499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 501 { 502 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 503 PetscErrorCode ierr; 504 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 505 PetscInt nz,idx,idt,j,i,oidx; 506 PetscInt bs=A->rmap->bs,bs2=a->bs2; 507 MatScalar *aa=a->a,*v; 508 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 509 PetscScalar *x,*b; 510 511 PetscFunctionBegin; 512 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 514 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515 516 /* forward solve the U^T */ 517 idx = 0; 518 for (i=0; i<n; i++) { 519 v = aa + bs2*diag[i]; 520 /* multiply by the inverse of the block diagonal */ 521 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 522 x5 = x[4+idx]; 523 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 524 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 525 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 526 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 527 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 528 v -= bs2; 529 530 vi = aj + diag[i] - 1; 531 nz = diag[i] - diag[i+1] - 1; 532 for(j=0;j>-nz;j--){ 533 oidx = bs*vi[j]; 534 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 535 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 536 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 537 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 538 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 539 v -= bs2; 540 } 541 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 542 idx += bs; 543 } 544 /* backward solve the L^T */ 545 for (i=n-1; i>=0; i--){ 546 v = aa + bs2*ai[i]; 547 vi = aj + ai[i]; 548 nz = ai[i+1] - ai[i]; 549 idt = bs*i; 550 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 551 for(j=0;j<nz;j++){ 552 idx = bs*vi[j]; 553 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 554 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 555 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 556 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 557 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 558 v += bs2; 559 } 560 } 561 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 562 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 563 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 564 PetscFunctionReturn(0); 565 } 566 567 #undef __FUNCT__ 568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 570 { 571 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572 PetscErrorCode ierr; 573 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574 PetscInt *diag = a->diag,oidx; 575 MatScalar *aa=a->a,*v; 576 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 577 PetscScalar *x,*b; 578 579 PetscFunctionBegin; 580 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583 584 /* forward solve the U^T */ 585 idx = 0; 586 for (i=0; i<n; i++) { 587 588 v = aa + 36*diag[i]; 589 /* multiply by the inverse of the block diagonal */ 590 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591 x6 = x[5+idx]; 592 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598 v += 36; 599 600 vi = aj + diag[i] + 1; 601 nz = ai[i+1] - diag[i] - 1; 602 while (nz--) { 603 oidx = 6*(*vi++); 604 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610 v += 36; 611 } 612 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613 x[5+idx] = s6; 614 idx += 6; 615 } 616 /* backward solve the L^T */ 617 for (i=n-1; i>=0; i--){ 618 v = aa + 36*diag[i] - 36; 619 vi = aj + diag[i] - 1; 620 nz = diag[i] - ai[i]; 621 idt = 6*i; 622 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623 s6 = x[5+idt]; 624 while (nz--) { 625 idx = 6*(*vi--); 626 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v -= 36; 633 } 634 } 635 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 636 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 #undef __FUNCT__ 642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 644 { 645 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 646 PetscErrorCode ierr; 647 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 648 PetscInt nz,idx,idt,j,i,oidx; 649 PetscInt bs=A->rmap->bs,bs2=a->bs2; 650 MatScalar *aa=a->a,*v; 651 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 652 PetscScalar *x,*b; 653 654 PetscFunctionBegin; 655 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 657 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658 659 /* forward solve the U^T */ 660 idx = 0; 661 for (i=0; i<n; i++) { 662 v = aa + bs2*diag[i]; 663 /* multiply by the inverse of the block diagonal */ 664 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 665 x5 = x[4+idx]; x6 = x[5+idx]; 666 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 667 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 668 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 669 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 670 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 671 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 672 v -= bs2; 673 674 vi = aj + diag[i] - 1; 675 nz = diag[i] - diag[i+1] - 1; 676 for(j=0;j>-nz;j--){ 677 oidx = bs*vi[j]; 678 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684 v -= bs2; 685 } 686 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 687 x[5+idx] = s6; 688 idx += bs; 689 } 690 /* backward solve the L^T */ 691 for (i=n-1; i>=0; i--){ 692 v = aa + bs2*ai[i]; 693 vi = aj + ai[i]; 694 nz = ai[i+1] - ai[i]; 695 idt = bs*i; 696 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 697 s6 = x[5+idt]; 698 for(j=0;j<nz;j++){ 699 idx = bs*vi[j]; 700 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 701 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 702 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 703 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 704 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 705 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 706 v += bs2; 707 } 708 } 709 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 711 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 718 { 719 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720 PetscErrorCode ierr; 721 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722 PetscInt *diag = a->diag,oidx; 723 MatScalar *aa=a->a,*v; 724 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 725 PetscScalar *x,*b; 726 727 PetscFunctionBegin; 728 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 729 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731 732 /* forward solve the U^T */ 733 idx = 0; 734 for (i=0; i<n; i++) { 735 736 v = aa + 49*diag[i]; 737 /* multiply by the inverse of the block diagonal */ 738 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739 x6 = x[5+idx]; x7 = x[6+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747 v += 49; 748 749 vi = aj + diag[i] + 1; 750 nz = ai[i+1] - diag[i] - 1; 751 while (nz--) { 752 oidx = 7*(*vi++); 753 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760 v += 49; 761 } 762 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763 x[5+idx] = s6;x[6+idx] = s7; 764 idx += 7; 765 } 766 /* backward solve the L^T */ 767 for (i=n-1; i>=0; i--){ 768 v = aa + 49*diag[i] - 49; 769 vi = aj + diag[i] - 1; 770 nz = diag[i] - ai[i]; 771 idt = 7*i; 772 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773 s6 = x[5+idt];s7 = x[6+idt]; 774 while (nz--) { 775 idx = 7*(*vi--); 776 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783 v -= 49; 784 } 785 } 786 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 787 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 PetscErrorCode ierr; 797 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 798 PetscInt nz,idx,idt,j,i,oidx; 799 PetscInt bs=A->rmap->bs,bs2=a->bs2; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 802 PetscScalar *x,*b; 803 804 PetscFunctionBegin; 805 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 807 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 808 809 /* forward solve the U^T */ 810 idx = 0; 811 for (i=0; i<n; i++) { 812 v = aa + bs2*diag[i]; 813 /* multiply by the inverse of the block diagonal */ 814 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 815 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 816 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 817 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 818 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 819 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 820 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 821 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 822 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 823 v -= bs2; 824 vi = aj + diag[i] - 1; 825 nz = diag[i] - diag[i+1] - 1; 826 for(j=0;j>-nz;j--){ 827 oidx = bs*vi[j]; 828 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835 v -= bs2; 836 } 837 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 838 x[5+idx] = s6; x[6+idx] = s7; 839 idx += bs; 840 } 841 /* backward solve the L^T */ 842 for (i=n-1; i>=0; i--){ 843 v = aa + bs2*ai[i]; 844 vi = aj + ai[i]; 845 nz = ai[i+1] - ai[i]; 846 idt = bs*i; 847 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 848 s6 = x[5+idt]; s7 = x[6+idt]; 849 for(j=0;j<nz;j++){ 850 idx = bs*vi[j]; 851 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 852 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 853 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 854 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 855 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 856 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 857 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 858 v += bs2; 859 } 860 } 861 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 862 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 863 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 867 /*---------------------------------------------------------------------------------------------*/ 868 #undef __FUNCT__ 869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 871 { 872 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873 IS iscol=a->col,isrow=a->row; 874 PetscErrorCode ierr; 875 const PetscInt *r,*c,*rout,*cout; 876 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877 PetscInt *diag = a->diag; 878 MatScalar *aa=a->a,*v; 879 PetscScalar s1,*x,*b,*t; 880 881 PetscFunctionBegin; 882 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 883 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884 t = a->solve_work; 885 886 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888 889 /* copy the b into temp work space according to permutation */ 890 for (i=0; i<n; i++) { 891 t[i] = b[c[i]]; 892 } 893 894 /* forward solve the U^T */ 895 for (i=0; i<n; i++) { 896 897 v = aa + diag[i]; 898 /* multiply by the inverse of the block diagonal */ 899 s1 = (*v++)*t[i]; 900 vi = aj + diag[i] + 1; 901 nz = ai[i+1] - diag[i] - 1; 902 while (nz--) { 903 t[*vi++] -= (*v++)*s1; 904 } 905 t[i] = s1; 906 } 907 /* backward solve the L^T */ 908 for (i=n-1; i>=0; i--){ 909 v = aa + diag[i] - 1; 910 vi = aj + diag[i] - 1; 911 nz = diag[i] - ai[i]; 912 s1 = t[i]; 913 while (nz--) { 914 t[*vi--] -= (*v--)*s1; 915 } 916 } 917 918 /* copy t into x according to permutation */ 919 for (i=0; i<n; i++) { 920 x[r[i]] = t[i]; 921 } 922 923 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 925 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 926 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 #undef __FUNCT__ 932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 934 { 935 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936 IS iscol=a->col,isrow=a->row; 937 PetscErrorCode ierr; 938 const PetscInt *r,*c,*rout,*cout; 939 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940 PetscInt *diag = a->diag,ii,ic,ir,oidx; 941 MatScalar *aa=a->a,*v; 942 PetscScalar s1,s2,x1,x2; 943 PetscScalar *x,*b,*t; 944 945 PetscFunctionBegin; 946 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 947 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948 t = a->solve_work; 949 950 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952 953 /* copy the b into temp work space according to permutation */ 954 ii = 0; 955 for (i=0; i<n; i++) { 956 ic = 2*c[i]; 957 t[ii] = b[ic]; 958 t[ii+1] = b[ic+1]; 959 ii += 2; 960 } 961 962 /* forward solve the U^T */ 963 idx = 0; 964 for (i=0; i<n; i++) { 965 966 v = aa + 4*diag[i]; 967 /* multiply by the inverse of the block diagonal */ 968 x1 = t[idx]; x2 = t[1+idx]; 969 s1 = v[0]*x1 + v[1]*x2; 970 s2 = v[2]*x1 + v[3]*x2; 971 v += 4; 972 973 vi = aj + diag[i] + 1; 974 nz = ai[i+1] - diag[i] - 1; 975 while (nz--) { 976 oidx = 2*(*vi++); 977 t[oidx] -= v[0]*s1 + v[1]*s2; 978 t[oidx+1] -= v[2]*s1 + v[3]*s2; 979 v += 4; 980 } 981 t[idx] = s1;t[1+idx] = s2; 982 idx += 2; 983 } 984 /* backward solve the L^T */ 985 for (i=n-1; i>=0; i--){ 986 v = aa + 4*diag[i] - 4; 987 vi = aj + diag[i] - 1; 988 nz = diag[i] - ai[i]; 989 idt = 2*i; 990 s1 = t[idt]; s2 = t[1+idt]; 991 while (nz--) { 992 idx = 2*(*vi--); 993 t[idx] -= v[0]*s1 + v[1]*s2; 994 t[idx+1] -= v[2]*s1 + v[3]*s2; 995 v -= 4; 996 } 997 } 998 999 /* copy t into x according to permutation */ 1000 ii = 0; 1001 for (i=0; i<n; i++) { 1002 ir = 2*r[i]; 1003 x[ir] = t[ii]; 1004 x[ir+1] = t[ii+1]; 1005 ii += 2; 1006 } 1007 1008 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1010 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1011 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013 PetscFunctionReturn(0); 1014 } 1015 1016 #undef __FUNCT__ 1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1019 { 1020 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1021 PetscErrorCode ierr; 1022 IS iscol=a->col,isrow=a->row; 1023 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1024 const PetscInt *r,*c,*rout,*cout; 1025 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1026 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1027 MatScalar *aa=a->a,*v; 1028 PetscScalar s1,s2,x1,x2; 1029 PetscScalar *x,*b,*t; 1030 1031 PetscFunctionBegin; 1032 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1033 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1034 t = a->solve_work; 1035 1036 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1037 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1038 1039 /* copy b into temp work space according to permutation */ 1040 for(i=0;i<n;i++){ 1041 ii = bs*i; ic = bs*c[i]; 1042 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1043 } 1044 1045 /* forward solve the U^T */ 1046 idx = 0; 1047 for (i=0; i<n; i++) { 1048 v = aa + bs2*diag[i]; 1049 /* multiply by the inverse of the block diagonal */ 1050 x1 = t[idx]; x2 = t[1+idx]; 1051 s1 = v[0]*x1 + v[1]*x2; 1052 s2 = v[2]*x1 + v[3]*x2; 1053 v -= bs2; 1054 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - diag[i+1] - 1; 1057 for(j=0;j>-nz;j--){ 1058 oidx = bs*vi[j]; 1059 t[oidx] -= v[0]*s1 + v[1]*s2; 1060 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1061 v -= bs2; 1062 } 1063 t[idx] = s1;t[1+idx] = s2; 1064 idx += bs; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + bs2*ai[i]; 1069 vi = aj + ai[i]; 1070 nz = ai[i+1] - ai[i]; 1071 idt = bs*i; 1072 s1 = t[idt]; s2 = t[1+idt]; 1073 for(j=0;j<nz;j++){ 1074 idx = bs*vi[j]; 1075 t[idx] -= v[0]*s1 + v[1]*s2; 1076 t[idx+1] -= v[2]*s1 + v[3]*s2; 1077 v += bs2; 1078 } 1079 } 1080 1081 /* copy t into x according to permutation */ 1082 for(i=0;i<n;i++){ 1083 ii = bs*i; ir = bs*r[i]; 1084 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 #undef __FUNCT__ 1096 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1097 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1098 { 1099 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1100 IS iscol=a->col,isrow=a->row; 1101 PetscErrorCode ierr; 1102 const PetscInt *r,*c,*rout,*cout; 1103 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1104 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1105 MatScalar *aa=a->a,*v; 1106 PetscScalar s1,s2,s3,x1,x2,x3; 1107 PetscScalar *x,*b,*t; 1108 1109 PetscFunctionBegin; 1110 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112 t = a->solve_work; 1113 1114 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1115 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1116 1117 /* copy the b into temp work space according to permutation */ 1118 ii = 0; 1119 for (i=0; i<n; i++) { 1120 ic = 3*c[i]; 1121 t[ii] = b[ic]; 1122 t[ii+1] = b[ic+1]; 1123 t[ii+2] = b[ic+2]; 1124 ii += 3; 1125 } 1126 1127 /* forward solve the U^T */ 1128 idx = 0; 1129 for (i=0; i<n; i++) { 1130 1131 v = aa + 9*diag[i]; 1132 /* multiply by the inverse of the block diagonal */ 1133 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1134 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1135 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1136 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1137 v += 9; 1138 1139 vi = aj + diag[i] + 1; 1140 nz = ai[i+1] - diag[i] - 1; 1141 while (nz--) { 1142 oidx = 3*(*vi++); 1143 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1144 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1145 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1146 v += 9; 1147 } 1148 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1149 idx += 3; 1150 } 1151 /* backward solve the L^T */ 1152 for (i=n-1; i>=0; i--){ 1153 v = aa + 9*diag[i] - 9; 1154 vi = aj + diag[i] - 1; 1155 nz = diag[i] - ai[i]; 1156 idt = 3*i; 1157 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1158 while (nz--) { 1159 idx = 3*(*vi--); 1160 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1161 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1162 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1163 v -= 9; 1164 } 1165 } 1166 1167 /* copy t into x according to permutation */ 1168 ii = 0; 1169 for (i=0; i<n; i++) { 1170 ir = 3*r[i]; 1171 x[ir] = t[ii]; 1172 x[ir+1] = t[ii+1]; 1173 x[ir+2] = t[ii+2]; 1174 ii += 3; 1175 } 1176 1177 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1178 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1181 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 #undef __FUNCT__ 1186 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1187 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1188 { 1189 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1190 PetscErrorCode ierr; 1191 IS iscol=a->col,isrow=a->row; 1192 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1193 const PetscInt *r,*c,*rout,*cout; 1194 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1195 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1196 MatScalar *aa=a->a,*v; 1197 PetscScalar s1,s2,s3,x1,x2,x3; 1198 PetscScalar *x,*b,*t; 1199 1200 PetscFunctionBegin; 1201 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1202 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1203 t = a->solve_work; 1204 1205 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1206 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1207 1208 /* copy b into temp work space according to permutation */ 1209 for(i=0;i<n;i++){ 1210 ii = bs*i; ic = bs*c[i]; 1211 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1212 } 1213 1214 /* forward solve the U^T */ 1215 idx = 0; 1216 for (i=0; i<n; i++) { 1217 v = aa + bs2*diag[i]; 1218 /* multiply by the inverse of the block diagonal */ 1219 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1220 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1221 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1222 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1223 v -= bs2; 1224 1225 vi = aj + diag[i] - 1; 1226 nz = diag[i] - diag[i+1] - 1; 1227 for(j=0;j>-nz;j--){ 1228 oidx = bs*vi[j]; 1229 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1230 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1231 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1232 v -= bs2; 1233 } 1234 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1235 idx += bs; 1236 } 1237 /* backward solve the L^T */ 1238 for (i=n-1; i>=0; i--){ 1239 v = aa + bs2*ai[i]; 1240 vi = aj + ai[i]; 1241 nz = ai[i+1] - ai[i]; 1242 idt = bs*i; 1243 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1244 for(j=0;j<nz;j++){ 1245 idx = bs*vi[j]; 1246 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1247 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1248 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1249 v += bs2; 1250 } 1251 } 1252 1253 /* copy t into x according to permutation */ 1254 for(i=0;i<n;i++){ 1255 ii = bs*i; ir = bs*r[i]; 1256 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1257 } 1258 1259 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1260 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1261 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1263 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 #undef __FUNCT__ 1268 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1269 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1270 { 1271 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1272 IS iscol=a->col,isrow=a->row; 1273 PetscErrorCode ierr; 1274 const PetscInt *r,*c,*rout,*cout; 1275 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1276 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1277 MatScalar *aa=a->a,*v; 1278 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1279 PetscScalar *x,*b,*t; 1280 1281 PetscFunctionBegin; 1282 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1283 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1284 t = a->solve_work; 1285 1286 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1287 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1288 1289 /* copy the b into temp work space according to permutation */ 1290 ii = 0; 1291 for (i=0; i<n; i++) { 1292 ic = 4*c[i]; 1293 t[ii] = b[ic]; 1294 t[ii+1] = b[ic+1]; 1295 t[ii+2] = b[ic+2]; 1296 t[ii+3] = b[ic+3]; 1297 ii += 4; 1298 } 1299 1300 /* forward solve the U^T */ 1301 idx = 0; 1302 for (i=0; i<n; i++) { 1303 1304 v = aa + 16*diag[i]; 1305 /* multiply by the inverse of the block diagonal */ 1306 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1307 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1308 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1309 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1310 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1311 v += 16; 1312 1313 vi = aj + diag[i] + 1; 1314 nz = ai[i+1] - diag[i] - 1; 1315 while (nz--) { 1316 oidx = 4*(*vi++); 1317 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1318 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1319 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1320 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1321 v += 16; 1322 } 1323 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1324 idx += 4; 1325 } 1326 /* backward solve the L^T */ 1327 for (i=n-1; i>=0; i--){ 1328 v = aa + 16*diag[i] - 16; 1329 vi = aj + diag[i] - 1; 1330 nz = diag[i] - ai[i]; 1331 idt = 4*i; 1332 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1333 while (nz--) { 1334 idx = 4*(*vi--); 1335 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1336 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1337 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1338 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1339 v -= 16; 1340 } 1341 } 1342 1343 /* copy t into x according to permutation */ 1344 ii = 0; 1345 for (i=0; i<n; i++) { 1346 ir = 4*r[i]; 1347 x[ir] = t[ii]; 1348 x[ir+1] = t[ii+1]; 1349 x[ir+2] = t[ii+2]; 1350 x[ir+3] = t[ii+3]; 1351 ii += 4; 1352 } 1353 1354 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1355 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1356 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 #undef __FUNCT__ 1363 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1364 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1365 { 1366 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1367 PetscErrorCode ierr; 1368 IS iscol=a->col,isrow=a->row; 1369 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1370 const PetscInt *r,*c,*rout,*cout; 1371 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1372 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1373 MatScalar *aa=a->a,*v; 1374 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1375 PetscScalar *x,*b,*t; 1376 1377 PetscFunctionBegin; 1378 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 t = a->solve_work; 1381 1382 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1383 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1384 1385 /* copy b into temp work space according to permutation */ 1386 for(i=0;i<n;i++){ 1387 ii = bs*i; ic = bs*c[i]; 1388 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1389 } 1390 1391 /* forward solve the U^T */ 1392 idx = 0; 1393 for (i=0; i<n; i++) { 1394 v = aa + bs2*diag[i]; 1395 /* multiply by the inverse of the block diagonal */ 1396 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1397 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1398 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1399 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1400 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1401 v -= bs2; 1402 1403 vi = aj + diag[i] - 1; 1404 nz = diag[i] - diag[i+1] - 1; 1405 for(j=0;j>-nz;j--){ 1406 oidx = bs*vi[j]; 1407 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1408 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1409 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1410 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1411 v -= bs2; 1412 } 1413 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1414 idx += bs; 1415 } 1416 /* backward solve the L^T */ 1417 for (i=n-1; i>=0; i--){ 1418 v = aa + bs2*ai[i]; 1419 vi = aj + ai[i]; 1420 nz = ai[i+1] - ai[i]; 1421 idt = bs*i; 1422 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1423 for(j=0;j<nz;j++){ 1424 idx = bs*vi[j]; 1425 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1426 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1427 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1428 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1429 v += bs2; 1430 } 1431 } 1432 1433 /* copy t into x according to permutation */ 1434 for(i=0;i<n;i++){ 1435 ii = bs*i; ir = bs*r[i]; 1436 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1437 } 1438 1439 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1440 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1443 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 #undef __FUNCT__ 1448 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1449 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1450 { 1451 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1452 IS iscol=a->col,isrow=a->row; 1453 PetscErrorCode ierr; 1454 const PetscInt *r,*c,*rout,*cout; 1455 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1456 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1457 MatScalar *aa=a->a,*v; 1458 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1459 PetscScalar *x,*b,*t; 1460 1461 PetscFunctionBegin; 1462 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1463 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1464 t = a->solve_work; 1465 1466 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1467 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1468 1469 /* copy the b into temp work space according to permutation */ 1470 ii = 0; 1471 for (i=0; i<n; i++) { 1472 ic = 5*c[i]; 1473 t[ii] = b[ic]; 1474 t[ii+1] = b[ic+1]; 1475 t[ii+2] = b[ic+2]; 1476 t[ii+3] = b[ic+3]; 1477 t[ii+4] = b[ic+4]; 1478 ii += 5; 1479 } 1480 1481 /* forward solve the U^T */ 1482 idx = 0; 1483 for (i=0; i<n; i++) { 1484 1485 v = aa + 25*diag[i]; 1486 /* multiply by the inverse of the block diagonal */ 1487 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1488 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1489 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1490 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1491 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1492 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1493 v += 25; 1494 1495 vi = aj + diag[i] + 1; 1496 nz = ai[i+1] - diag[i] - 1; 1497 while (nz--) { 1498 oidx = 5*(*vi++); 1499 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1500 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1501 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1502 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1503 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1504 v += 25; 1505 } 1506 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1507 idx += 5; 1508 } 1509 /* backward solve the L^T */ 1510 for (i=n-1; i>=0; i--){ 1511 v = aa + 25*diag[i] - 25; 1512 vi = aj + diag[i] - 1; 1513 nz = diag[i] - ai[i]; 1514 idt = 5*i; 1515 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1516 while (nz--) { 1517 idx = 5*(*vi--); 1518 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1519 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1520 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1521 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1522 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1523 v -= 25; 1524 } 1525 } 1526 1527 /* copy t into x according to permutation */ 1528 ii = 0; 1529 for (i=0; i<n; i++) { 1530 ir = 5*r[i]; 1531 x[ir] = t[ii]; 1532 x[ir+1] = t[ii+1]; 1533 x[ir+2] = t[ii+2]; 1534 x[ir+3] = t[ii+3]; 1535 x[ir+4] = t[ii+4]; 1536 ii += 5; 1537 } 1538 1539 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1540 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1541 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1542 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1543 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1544 PetscFunctionReturn(0); 1545 } 1546 1547 #undef __FUNCT__ 1548 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1549 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1550 { 1551 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1552 PetscErrorCode ierr; 1553 IS iscol=a->col,isrow=a->row; 1554 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1555 const PetscInt *r,*c,*rout,*cout; 1556 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1557 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1558 MatScalar *aa=a->a,*v; 1559 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1560 PetscScalar *x,*b,*t; 1561 1562 PetscFunctionBegin; 1563 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1564 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1565 t = a->solve_work; 1566 1567 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1568 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1569 1570 /* copy b into temp work space according to permutation */ 1571 for(i=0;i<n;i++){ 1572 ii = bs*i; ic = bs*c[i]; 1573 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1574 t[ii+4] = b[ic+4]; 1575 } 1576 1577 /* forward solve the U^T */ 1578 idx = 0; 1579 for (i=0; i<n; i++) { 1580 v = aa + bs2*diag[i]; 1581 /* multiply by the inverse of the block diagonal */ 1582 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1583 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1584 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1585 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1586 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1587 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1588 v -= bs2; 1589 1590 vi = aj + diag[i] - 1; 1591 nz = diag[i] - diag[i+1] - 1; 1592 for(j=0;j>-nz;j--){ 1593 oidx = bs*vi[j]; 1594 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1595 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1596 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1597 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1598 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1599 v -= bs2; 1600 } 1601 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1602 idx += bs; 1603 } 1604 /* backward solve the L^T */ 1605 for (i=n-1; i>=0; i--){ 1606 v = aa + bs2*ai[i]; 1607 vi = aj + ai[i]; 1608 nz = ai[i+1] - ai[i]; 1609 idt = bs*i; 1610 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1611 for(j=0;j<nz;j++){ 1612 idx = bs*vi[j]; 1613 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1614 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1615 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1616 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1617 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1618 v += bs2; 1619 } 1620 } 1621 1622 /* copy t into x according to permutation */ 1623 for(i=0;i<n;i++){ 1624 ii = bs*i; ir = bs*r[i]; 1625 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1626 x[ir+4] = t[ii+4]; 1627 } 1628 1629 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1630 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 #undef __FUNCT__ 1638 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1639 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1640 { 1641 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1642 IS iscol=a->col,isrow=a->row; 1643 PetscErrorCode ierr; 1644 const PetscInt *r,*c,*rout,*cout; 1645 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1646 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1647 MatScalar *aa=a->a,*v; 1648 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1649 PetscScalar *x,*b,*t; 1650 1651 PetscFunctionBegin; 1652 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1653 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1654 t = a->solve_work; 1655 1656 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1657 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1658 1659 /* copy the b into temp work space according to permutation */ 1660 ii = 0; 1661 for (i=0; i<n; i++) { 1662 ic = 6*c[i]; 1663 t[ii] = b[ic]; 1664 t[ii+1] = b[ic+1]; 1665 t[ii+2] = b[ic+2]; 1666 t[ii+3] = b[ic+3]; 1667 t[ii+4] = b[ic+4]; 1668 t[ii+5] = b[ic+5]; 1669 ii += 6; 1670 } 1671 1672 /* forward solve the U^T */ 1673 idx = 0; 1674 for (i=0; i<n; i++) { 1675 1676 v = aa + 36*diag[i]; 1677 /* multiply by the inverse of the block diagonal */ 1678 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1679 x6 = t[5+idx]; 1680 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1681 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1682 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1683 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1684 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1685 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1686 v += 36; 1687 1688 vi = aj + diag[i] + 1; 1689 nz = ai[i+1] - diag[i] - 1; 1690 while (nz--) { 1691 oidx = 6*(*vi++); 1692 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1693 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1694 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1695 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1696 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1697 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1698 v += 36; 1699 } 1700 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1701 t[5+idx] = s6; 1702 idx += 6; 1703 } 1704 /* backward solve the L^T */ 1705 for (i=n-1; i>=0; i--){ 1706 v = aa + 36*diag[i] - 36; 1707 vi = aj + diag[i] - 1; 1708 nz = diag[i] - ai[i]; 1709 idt = 6*i; 1710 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1711 s6 = t[5+idt]; 1712 while (nz--) { 1713 idx = 6*(*vi--); 1714 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1715 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1716 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1717 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1718 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1719 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1720 v -= 36; 1721 } 1722 } 1723 1724 /* copy t into x according to permutation */ 1725 ii = 0; 1726 for (i=0; i<n; i++) { 1727 ir = 6*r[i]; 1728 x[ir] = t[ii]; 1729 x[ir+1] = t[ii+1]; 1730 x[ir+2] = t[ii+2]; 1731 x[ir+3] = t[ii+3]; 1732 x[ir+4] = t[ii+4]; 1733 x[ir+5] = t[ii+5]; 1734 ii += 6; 1735 } 1736 1737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1739 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 #undef __FUNCT__ 1746 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1747 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1748 { 1749 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1750 PetscErrorCode ierr; 1751 IS iscol=a->col,isrow=a->row; 1752 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1753 const PetscInt *r,*c,*rout,*cout; 1754 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1755 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1756 MatScalar *aa=a->a,*v; 1757 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1758 PetscScalar *x,*b,*t; 1759 1760 PetscFunctionBegin; 1761 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 t = a->solve_work; 1764 1765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767 1768 /* copy b into temp work space according to permutation */ 1769 for(i=0;i<n;i++){ 1770 ii = bs*i; ic = bs*c[i]; 1771 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1772 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1773 } 1774 1775 /* forward solve the U^T */ 1776 idx = 0; 1777 for (i=0; i<n; i++) { 1778 v = aa + bs2*diag[i]; 1779 /* multiply by the inverse of the block diagonal */ 1780 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1781 x6 = t[5+idx]; 1782 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1783 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1784 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1785 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1786 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1787 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1788 v -= bs2; 1789 1790 vi = aj + diag[i] - 1; 1791 nz = diag[i] - diag[i+1] - 1; 1792 for(j=0;j>-nz;j--){ 1793 oidx = bs*vi[j]; 1794 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1795 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1796 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1797 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1798 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1799 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1800 v -= bs2; 1801 } 1802 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1803 t[5+idx] = s6; 1804 idx += bs; 1805 } 1806 /* backward solve the L^T */ 1807 for (i=n-1; i>=0; i--){ 1808 v = aa + bs2*ai[i]; 1809 vi = aj + ai[i]; 1810 nz = ai[i+1] - ai[i]; 1811 idt = bs*i; 1812 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1813 s6 = t[5+idt]; 1814 for(j=0;j<nz;j++){ 1815 idx = bs*vi[j]; 1816 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1817 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1818 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1819 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1820 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1821 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1822 v += bs2; 1823 } 1824 } 1825 1826 /* copy t into x according to permutation */ 1827 for(i=0;i<n;i++){ 1828 ii = bs*i; ir = bs*r[i]; 1829 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1830 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1831 } 1832 1833 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1834 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1835 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1836 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1837 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1838 PetscFunctionReturn(0); 1839 } 1840 1841 #undef __FUNCT__ 1842 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1843 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1844 { 1845 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1846 IS iscol=a->col,isrow=a->row; 1847 PetscErrorCode ierr; 1848 const PetscInt *r,*c,*rout,*cout; 1849 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1851 MatScalar *aa=a->a,*v; 1852 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1853 PetscScalar *x,*b,*t; 1854 1855 PetscFunctionBegin; 1856 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1857 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858 t = a->solve_work; 1859 1860 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862 1863 /* copy the b into temp work space according to permutation */ 1864 ii = 0; 1865 for (i=0; i<n; i++) { 1866 ic = 7*c[i]; 1867 t[ii] = b[ic]; 1868 t[ii+1] = b[ic+1]; 1869 t[ii+2] = b[ic+2]; 1870 t[ii+3] = b[ic+3]; 1871 t[ii+4] = b[ic+4]; 1872 t[ii+5] = b[ic+5]; 1873 t[ii+6] = b[ic+6]; 1874 ii += 7; 1875 } 1876 1877 /* forward solve the U^T */ 1878 idx = 0; 1879 for (i=0; i<n; i++) { 1880 1881 v = aa + 49*diag[i]; 1882 /* multiply by the inverse of the block diagonal */ 1883 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1884 x6 = t[5+idx]; x7 = t[6+idx]; 1885 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1886 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1887 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1888 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1889 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1890 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1891 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1892 v += 49; 1893 1894 vi = aj + diag[i] + 1; 1895 nz = ai[i+1] - diag[i] - 1; 1896 while (nz--) { 1897 oidx = 7*(*vi++); 1898 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1899 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1900 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1901 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1902 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1903 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1904 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1905 v += 49; 1906 } 1907 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1908 t[5+idx] = s6;t[6+idx] = s7; 1909 idx += 7; 1910 } 1911 /* backward solve the L^T */ 1912 for (i=n-1; i>=0; i--){ 1913 v = aa + 49*diag[i] - 49; 1914 vi = aj + diag[i] - 1; 1915 nz = diag[i] - ai[i]; 1916 idt = 7*i; 1917 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1918 s6 = t[5+idt];s7 = t[6+idt]; 1919 while (nz--) { 1920 idx = 7*(*vi--); 1921 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1922 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1923 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1924 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1925 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1926 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1927 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1928 v -= 49; 1929 } 1930 } 1931 1932 /* copy t into x according to permutation */ 1933 ii = 0; 1934 for (i=0; i<n; i++) { 1935 ir = 7*r[i]; 1936 x[ir] = t[ii]; 1937 x[ir+1] = t[ii+1]; 1938 x[ir+2] = t[ii+2]; 1939 x[ir+3] = t[ii+3]; 1940 x[ir+4] = t[ii+4]; 1941 x[ir+5] = t[ii+5]; 1942 x[ir+6] = t[ii+6]; 1943 ii += 7; 1944 } 1945 1946 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1948 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1949 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1951 PetscFunctionReturn(0); 1952 } 1953 #undef __FUNCT__ 1954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1955 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1956 { 1957 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1958 PetscErrorCode ierr; 1959 IS iscol=a->col,isrow=a->row; 1960 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1961 const PetscInt *r,*c,*rout,*cout; 1962 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1963 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1964 MatScalar *aa=a->a,*v; 1965 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1966 PetscScalar *x,*b,*t; 1967 1968 PetscFunctionBegin; 1969 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1970 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1971 t = a->solve_work; 1972 1973 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1974 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1975 1976 /* copy b into temp work space according to permutation */ 1977 for(i=0;i<n;i++){ 1978 ii = bs*i; ic = bs*c[i]; 1979 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1980 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 1981 } 1982 1983 /* forward solve the U^T */ 1984 idx = 0; 1985 for (i=0; i<n; i++) { 1986 v = aa + bs2*diag[i]; 1987 /* multiply by the inverse of the block diagonal */ 1988 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1989 x6 = t[5+idx]; x7 = t[6+idx]; 1990 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1991 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1992 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1993 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1994 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1995 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1996 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1997 v -= bs2; 1998 1999 vi = aj + diag[i] - 1; 2000 nz = diag[i] - diag[i+1] - 1; 2001 for(j=0;j>-nz;j--){ 2002 oidx = bs*vi[j]; 2003 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2004 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2005 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2006 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2007 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2008 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2009 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2010 v -= bs2; 2011 } 2012 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2013 t[5+idx] = s6; t[6+idx] = s7; 2014 idx += bs; 2015 } 2016 /* backward solve the L^T */ 2017 for (i=n-1; i>=0; i--){ 2018 v = aa + bs2*ai[i]; 2019 vi = aj + ai[i]; 2020 nz = ai[i+1] - ai[i]; 2021 idt = bs*i; 2022 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2023 s6 = t[5+idt]; s7 = t[6+idt]; 2024 for(j=0;j<nz;j++){ 2025 idx = bs*vi[j]; 2026 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2027 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2028 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2029 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2030 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2031 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2032 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2033 v += bs2; 2034 } 2035 } 2036 2037 /* copy t into x according to permutation */ 2038 for(i=0;i<n;i++){ 2039 ii = bs*i; ir = bs*r[i]; 2040 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2041 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2042 } 2043 2044 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2045 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2046 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2047 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2048 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2049 PetscFunctionReturn(0); 2050 } 2051 2052 /* ----------------------------------------------------------- */ 2053 #undef __FUNCT__ 2054 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2055 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2056 { 2057 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2058 IS iscol=a->col,isrow=a->row; 2059 PetscErrorCode ierr; 2060 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2061 PetscInt i,n=a->mbs; 2062 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2063 MatScalar *aa=a->a,*v; 2064 PetscScalar *x,*b,*s,*t,*ls; 2065 2066 PetscFunctionBegin; 2067 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2068 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2069 t = a->solve_work; 2070 2071 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2072 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2073 2074 /* forward solve the lower triangular */ 2075 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2076 for (i=1; i<n; i++) { 2077 v = aa + bs2*ai[i]; 2078 vi = aj + ai[i]; 2079 nz = a->diag[i] - ai[i]; 2080 s = t + bs*i; 2081 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2082 while (nz--) { 2083 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2084 v += bs2; 2085 } 2086 } 2087 /* backward solve the upper triangular */ 2088 ls = a->solve_work + A->cmap->n; 2089 for (i=n-1; i>=0; i--){ 2090 v = aa + bs2*(a->diag[i] + 1); 2091 vi = aj + a->diag[i] + 1; 2092 nz = ai[i+1] - a->diag[i] - 1; 2093 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2094 while (nz--) { 2095 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2096 v += bs2; 2097 } 2098 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2099 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2100 } 2101 2102 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2103 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2104 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2105 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2106 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2107 PetscFunctionReturn(0); 2108 } 2109 2110 /* ----------------------------------------------------------- */ 2111 #undef __FUNCT__ 2112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2113 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2114 { 2115 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2116 IS iscol=a->col,isrow=a->row; 2117 PetscErrorCode ierr; 2118 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2119 PetscInt i,n=a->mbs,j; 2120 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2121 const MatScalar *aa=a->a,*v; 2122 PetscScalar *x,*t,*ls; 2123 const PetscScalar *b; 2124 PetscFunctionBegin; 2125 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2126 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2127 t = a->solve_work; 2128 2129 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2130 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2131 2132 /* copy the b into temp work space according to permutation */ 2133 for (i=0; i<n; i++) { 2134 for (j=0; j<bs; j++) { 2135 t[i*bs+j] = b[c[i]*bs+j]; 2136 } 2137 } 2138 2139 2140 /* forward solve the upper triangular transpose */ 2141 ls = a->solve_work + A->cmap->n; 2142 for (i=0; i<n; i++){ 2143 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2144 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2145 v = aa + bs2*(a->diag[i] + 1); 2146 vi = aj + a->diag[i] + 1; 2147 nz = ai[i+1] - a->diag[i] - 1; 2148 while (nz--) { 2149 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2150 v += bs2; 2151 } 2152 } 2153 2154 /* backward solve the lower triangular transpose */ 2155 for (i=n-1; i>=0; i--) { 2156 v = aa + bs2*ai[i]; 2157 vi = aj + ai[i]; 2158 nz = a->diag[i] - ai[i]; 2159 while (nz--) { 2160 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2161 v += bs2; 2162 } 2163 } 2164 2165 /* copy t into x according to permutation */ 2166 for (i=0; i<n; i++) { 2167 for (j=0; j<bs; j++) { 2168 x[bs*r[i]+j] = t[bs*i+j]; 2169 } 2170 } 2171 2172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2174 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2176 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2177 PetscFunctionReturn(0); 2178 } 2179 2180 #undef __FUNCT__ 2181 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2182 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2183 { 2184 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2185 IS iscol=a->col,isrow=a->row; 2186 PetscErrorCode ierr; 2187 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2188 PetscInt i,n=a->mbs,j; 2189 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2190 const MatScalar *aa=a->a,*v; 2191 PetscScalar *x,*t,*ls; 2192 const PetscScalar *b; 2193 PetscFunctionBegin; 2194 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2195 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2196 t = a->solve_work; 2197 2198 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2199 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2200 2201 /* copy the b into temp work space according to permutation */ 2202 for (i=0; i<n; i++) { 2203 for (j=0; j<bs; j++) { 2204 t[i*bs+j] = b[c[i]*bs+j]; 2205 } 2206 } 2207 2208 2209 /* forward solve the upper triangular transpose */ 2210 ls = a->solve_work + A->cmap->n; 2211 for (i=0; i<n; i++){ 2212 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2213 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2214 v = aa + bs2*(diag[i] - 1); 2215 vi = aj + diag[i] - 1; 2216 nz = diag[i] - diag[i+1] - 1; 2217 for(j=0;j>-nz;j--){ 2218 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2219 v -= bs2; 2220 } 2221 } 2222 2223 /* backward solve the lower triangular transpose */ 2224 for (i=n-1; i>=0; i--) { 2225 v = aa + bs2*ai[i]; 2226 vi = aj + ai[i]; 2227 nz = ai[i+1] - ai[i]; 2228 for(j=0;j<nz;j++){ 2229 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2230 v += bs2; 2231 } 2232 } 2233 2234 /* copy t into x according to permutation */ 2235 for (i=0; i<n; i++) { 2236 for (j=0; j<bs; j++) { 2237 x[bs*r[i]+j] = t[bs*i+j]; 2238 } 2239 } 2240 2241 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2242 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2243 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2244 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2245 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2246 PetscFunctionReturn(0); 2247 } 2248 2249 /* bs = 15 for PFLOTRAN */ 2250 #undef __FUNCT__ 2251 #define __FUNCT__ "MatSolve_SeqBAIJ_15" 2252 PetscErrorCode MatSolve_SeqBAIJ_15(Mat A,Vec bb,Vec xx) 2253 { 2254 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2255 IS iscol=a->col,isrow=a->row; 2256 PetscErrorCode ierr; 2257 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi,bs=A->rmap->bs,bs2=a->bs2; 2258 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2259 MatScalar *aa=a->a,*v; 2260 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2261 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2262 PetscScalar *x,*b,*t; 2263 2264 PetscFunctionBegin; 2265 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2266 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2267 t = a->solve_work; 2268 2269 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2270 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2271 2272 /* forward solve the lower triangular */ 2273 idx = bs*r[0]; 2274 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2275 t[5] = b[5+idx]; t[6] = b[6+idx]; t[7] = b[7+idx]; t[8] = b[8+idx]; t[9] = b[9+idx]; 2276 t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx]; 2277 2278 for (i=1; i<n; i++) { 2279 v = aa + bs2*ai[i]; 2280 vi = aj + ai[i]; 2281 nz = ai[i+1] - ai[i]; 2282 idx = bs*r[i]; 2283 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx]; s5 = b[4+idx]; 2284 s6 = b[5+idx]; s7 = b[6+idx]; s8 = b[7+idx]; s9 = b[8+idx]; s10 = b[9+idx]; 2285 s11 = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx]; 2286 for(m=0;m<nz;m++){ 2287 idx = bs*vi[m]; 2288 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2289 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2290 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2291 2292 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2293 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2294 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2295 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2296 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2297 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2298 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2299 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2300 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2301 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2302 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2303 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2304 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2305 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2306 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2307 2308 v += bs2; 2309 } 2310 idx = bs*i; 2311 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] = s5; 2312 t[5+idx] = s6; t[6+idx] = s7; t[7+idx] = s8; t[8+idx] = s9; t[9+idx] = s10; 2313 t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15; 2314 2315 } 2316 /* backward solve the upper triangular */ 2317 for (i=n-1; i>=0; i--){ 2318 v = aa + bs2*(adiag[i+1]+1); 2319 vi = aj + adiag[i+1]+1; 2320 nz = adiag[i] - adiag[i+1] - 1; 2321 idt = bs*i; 2322 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2323 s6 = t[5+idt]; s7 = t[6+idt]; s8 = t[7+idt]; s9 = t[8+idt]; s10 = t[9+idt]; 2324 s11 = t[10+idt]; s12 = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt]; 2325 2326 for(m=0;m<nz;m++){ 2327 idx = bs*vi[m]; 2328 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2329 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2330 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2331 2332 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2333 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2334 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2335 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2336 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2337 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2338 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2339 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2340 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2341 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2342 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2343 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2344 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2345 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2346 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2347 2348 v += bs2; 2349 } 2350 idc = bs*c[i]; 2351 2352 x[idc] = t[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2353 x[1+idc] = t[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2354 x[2+idc] = t[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2355 x[3+idc] = t[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2356 x[4+idc] = t[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2357 x[5+idc] = t[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2358 x[6+idc] = t[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2359 x[7+idc] = t[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2360 x[8+idc] = t[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2361 x[9+idc] = t[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2362 x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2363 x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2364 x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2365 x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2366 x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2367 2368 } 2369 2370 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2371 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2372 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2374 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2375 PetscFunctionReturn(0); 2376 } 2377 2378 #undef __FUNCT__ 2379 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2380 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2381 { 2382 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2383 IS iscol=a->col,isrow=a->row; 2384 PetscErrorCode ierr; 2385 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 2386 PetscInt i,n=a->mbs,nz,idx,idt,idc; 2387 MatScalar *aa=a->a,*v; 2388 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2389 PetscScalar *x,*b,*t; 2390 2391 PetscFunctionBegin; 2392 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2394 t = a->solve_work; 2395 2396 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2397 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2398 2399 /* forward solve the lower triangular */ 2400 idx = 7*(*r++); 2401 t[0] = b[idx]; t[1] = b[1+idx]; 2402 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2403 t[5] = b[5+idx]; t[6] = b[6+idx]; 2404 2405 for (i=1; i<n; i++) { 2406 v = aa + 49*ai[i]; 2407 vi = aj + ai[i]; 2408 nz = diag[i] - ai[i]; 2409 idx = 7*(*r++); 2410 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2411 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2412 while (nz--) { 2413 idx = 7*(*vi++); 2414 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2415 x4 = t[3+idx];x5 = t[4+idx]; 2416 x6 = t[5+idx];x7 = t[6+idx]; 2417 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2418 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2419 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2420 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2421 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2422 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2423 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2424 v += 49; 2425 } 2426 idx = 7*i; 2427 t[idx] = s1;t[1+idx] = s2; 2428 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2429 t[5+idx] = s6;t[6+idx] = s7; 2430 } 2431 /* backward solve the upper triangular */ 2432 for (i=n-1; i>=0; i--){ 2433 v = aa + 49*diag[i] + 49; 2434 vi = aj + diag[i] + 1; 2435 nz = ai[i+1] - diag[i] - 1; 2436 idt = 7*i; 2437 s1 = t[idt]; s2 = t[1+idt]; 2438 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2439 s6 = t[5+idt];s7 = t[6+idt]; 2440 while (nz--) { 2441 idx = 7*(*vi++); 2442 x1 = t[idx]; x2 = t[1+idx]; 2443 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2444 x6 = t[5+idx]; x7 = t[6+idx]; 2445 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2446 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2447 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2448 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2449 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2450 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2451 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2452 v += 49; 2453 } 2454 idc = 7*(*c--); 2455 v = aa + 49*diag[i]; 2456 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2457 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2458 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2459 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2460 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2461 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2462 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2463 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2464 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2465 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2466 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2467 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2468 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2469 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2470 } 2471 2472 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2473 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2474 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2475 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2476 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2477 PetscFunctionReturn(0); 2478 } 2479 2480 #undef __FUNCT__ 2481 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2482 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2483 { 2484 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2485 IS iscol=a->col,isrow=a->row; 2486 PetscErrorCode ierr; 2487 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 2488 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2489 MatScalar *aa=a->a,*v; 2490 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2491 PetscScalar *x,*b,*t; 2492 2493 PetscFunctionBegin; 2494 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2495 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2496 t = a->solve_work; 2497 2498 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2499 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2500 2501 /* forward solve the lower triangular */ 2502 idx = 7*r[0]; 2503 t[0] = b[idx]; t[1] = b[1+idx]; 2504 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2505 t[5] = b[5+idx]; t[6] = b[6+idx]; 2506 2507 for (i=1; i<n; i++) { 2508 v = aa + 49*ai[i]; 2509 vi = aj + ai[i]; 2510 nz = ai[i+1] - ai[i]; 2511 idx = 7*r[i]; 2512 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2513 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2514 for(m=0;m<nz;m++){ 2515 idx = 7*vi[m]; 2516 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2517 x4 = t[3+idx];x5 = t[4+idx]; 2518 x6 = t[5+idx];x7 = t[6+idx]; 2519 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2520 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2521 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2522 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2523 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2524 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2525 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2526 v += 49; 2527 } 2528 idx = 7*i; 2529 t[idx] = s1;t[1+idx] = s2; 2530 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2531 t[5+idx] = s6;t[6+idx] = s7; 2532 } 2533 /* backward solve the upper triangular */ 2534 for (i=n-1; i>=0; i--){ 2535 v = aa + 49*(adiag[i+1]+1); 2536 vi = aj + adiag[i+1]+1; 2537 nz = adiag[i] - adiag[i+1] - 1; 2538 idt = 7*i; 2539 s1 = t[idt]; s2 = t[1+idt]; 2540 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2541 s6 = t[5+idt];s7 = t[6+idt]; 2542 for(m=0;m<nz;m++){ 2543 idx = 7*vi[m]; 2544 x1 = t[idx]; x2 = t[1+idx]; 2545 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2546 x6 = t[5+idx]; x7 = t[6+idx]; 2547 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2548 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2549 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2550 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2551 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2552 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2553 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2554 v += 49; 2555 } 2556 idc = 7*c[i]; 2557 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2558 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2559 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2560 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2561 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2562 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2563 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2564 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2565 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2566 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2567 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2568 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2569 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2570 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2571 } 2572 2573 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2574 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2575 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2576 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2577 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2578 PetscFunctionReturn(0); 2579 } 2580 2581 #undef __FUNCT__ 2582 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2583 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2584 { 2585 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2586 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2587 PetscErrorCode ierr; 2588 PetscInt *diag = a->diag,jdx; 2589 const MatScalar *aa=a->a,*v; 2590 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2591 const PetscScalar *b; 2592 2593 PetscFunctionBegin; 2594 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2595 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2596 /* forward solve the lower triangular */ 2597 idx = 0; 2598 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2599 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2600 x[6] = b[6+idx]; 2601 for (i=1; i<n; i++) { 2602 v = aa + 49*ai[i]; 2603 vi = aj + ai[i]; 2604 nz = diag[i] - ai[i]; 2605 idx = 7*i; 2606 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2607 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2608 s7 = b[6+idx]; 2609 while (nz--) { 2610 jdx = 7*(*vi++); 2611 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2612 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2613 x7 = x[6+jdx]; 2614 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2615 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2616 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2617 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2618 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2619 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2620 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2621 v += 49; 2622 } 2623 x[idx] = s1; 2624 x[1+idx] = s2; 2625 x[2+idx] = s3; 2626 x[3+idx] = s4; 2627 x[4+idx] = s5; 2628 x[5+idx] = s6; 2629 x[6+idx] = s7; 2630 } 2631 /* backward solve the upper triangular */ 2632 for (i=n-1; i>=0; i--){ 2633 v = aa + 49*diag[i] + 49; 2634 vi = aj + diag[i] + 1; 2635 nz = ai[i+1] - diag[i] - 1; 2636 idt = 7*i; 2637 s1 = x[idt]; s2 = x[1+idt]; 2638 s3 = x[2+idt]; s4 = x[3+idt]; 2639 s5 = x[4+idt]; s6 = x[5+idt]; 2640 s7 = x[6+idt]; 2641 while (nz--) { 2642 idx = 7*(*vi++); 2643 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2644 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2645 x7 = x[6+idx]; 2646 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2647 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2648 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2649 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2650 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2651 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2652 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2653 v += 49; 2654 } 2655 v = aa + 49*diag[i]; 2656 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2657 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2658 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2659 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2660 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2661 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2662 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2663 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2664 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2665 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2666 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2667 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2668 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2669 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2670 } 2671 2672 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2673 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2674 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2675 PetscFunctionReturn(0); 2676 } 2677 2678 #undef __FUNCT__ 2679 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2680 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2681 { 2682 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2683 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2684 PetscErrorCode ierr; 2685 PetscInt idx,jdx,idt; 2686 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2687 const MatScalar *aa=a->a,*v; 2688 PetscScalar *x; 2689 const PetscScalar *b; 2690 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2691 2692 PetscFunctionBegin; 2693 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2694 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2695 /* forward solve the lower triangular */ 2696 idx = 0; 2697 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2698 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2699 for (i=1; i<n; i++) { 2700 v = aa + bs2*ai[i]; 2701 vi = aj + ai[i]; 2702 nz = ai[i+1] - ai[i]; 2703 idx = bs*i; 2704 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2705 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2706 for(k=0;k<nz;k++) { 2707 jdx = bs*vi[k]; 2708 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2709 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2710 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2711 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2712 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2713 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2714 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2715 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2716 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2717 v += bs2; 2718 } 2719 2720 x[idx] = s1; 2721 x[1+idx] = s2; 2722 x[2+idx] = s3; 2723 x[3+idx] = s4; 2724 x[4+idx] = s5; 2725 x[5+idx] = s6; 2726 x[6+idx] = s7; 2727 } 2728 2729 /* backward solve the upper triangular */ 2730 for (i=n-1; i>=0; i--){ 2731 v = aa + bs2*(adiag[i+1]+1); 2732 vi = aj + adiag[i+1]+1; 2733 nz = adiag[i] - adiag[i+1]-1; 2734 idt = bs*i; 2735 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2736 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2737 for(k=0;k<nz;k++) { 2738 idx = bs*vi[k]; 2739 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2740 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2741 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2742 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2743 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2744 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2745 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2746 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2747 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2748 v += bs2; 2749 } 2750 /* x = inv_diagonal*x */ 2751 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2752 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2753 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2754 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2755 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2756 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2757 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2758 } 2759 2760 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2762 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2763 PetscFunctionReturn(0); 2764 } 2765 2766 #undef __FUNCT__ 2767 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2768 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2769 { 2770 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2771 IS iscol=a->col,isrow=a->row; 2772 PetscErrorCode ierr; 2773 const PetscInt *r,*c,*rout,*cout; 2774 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2775 const MatScalar *aa=a->a,*v; 2776 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2777 const PetscScalar *b; 2778 PetscFunctionBegin; 2779 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2780 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2781 t = a->solve_work; 2782 2783 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2784 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2785 2786 /* forward solve the lower triangular */ 2787 idx = 6*(*r++); 2788 t[0] = b[idx]; t[1] = b[1+idx]; 2789 t[2] = b[2+idx]; t[3] = b[3+idx]; 2790 t[4] = b[4+idx]; t[5] = b[5+idx]; 2791 for (i=1; i<n; i++) { 2792 v = aa + 36*ai[i]; 2793 vi = aj + ai[i]; 2794 nz = diag[i] - ai[i]; 2795 idx = 6*(*r++); 2796 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2797 s5 = b[4+idx]; s6 = b[5+idx]; 2798 while (nz--) { 2799 idx = 6*(*vi++); 2800 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2801 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2802 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2803 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2804 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2805 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2806 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2807 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2808 v += 36; 2809 } 2810 idx = 6*i; 2811 t[idx] = s1;t[1+idx] = s2; 2812 t[2+idx] = s3;t[3+idx] = s4; 2813 t[4+idx] = s5;t[5+idx] = s6; 2814 } 2815 /* backward solve the upper triangular */ 2816 for (i=n-1; i>=0; i--){ 2817 v = aa + 36*diag[i] + 36; 2818 vi = aj + diag[i] + 1; 2819 nz = ai[i+1] - diag[i] - 1; 2820 idt = 6*i; 2821 s1 = t[idt]; s2 = t[1+idt]; 2822 s3 = t[2+idt];s4 = t[3+idt]; 2823 s5 = t[4+idt];s6 = t[5+idt]; 2824 while (nz--) { 2825 idx = 6*(*vi++); 2826 x1 = t[idx]; x2 = t[1+idx]; 2827 x3 = t[2+idx]; x4 = t[3+idx]; 2828 x5 = t[4+idx]; x6 = t[5+idx]; 2829 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2830 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2831 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2832 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2833 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2834 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2835 v += 36; 2836 } 2837 idc = 6*(*c--); 2838 v = aa + 36*diag[i]; 2839 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2840 v[18]*s4+v[24]*s5+v[30]*s6; 2841 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2842 v[19]*s4+v[25]*s5+v[31]*s6; 2843 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2844 v[20]*s4+v[26]*s5+v[32]*s6; 2845 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2846 v[21]*s4+v[27]*s5+v[33]*s6; 2847 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2848 v[22]*s4+v[28]*s5+v[34]*s6; 2849 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2850 v[23]*s4+v[29]*s5+v[35]*s6; 2851 } 2852 2853 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2854 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2855 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2856 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2857 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2858 PetscFunctionReturn(0); 2859 } 2860 2861 #undef __FUNCT__ 2862 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2863 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 2864 { 2865 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2866 IS iscol=a->col,isrow=a->row; 2867 PetscErrorCode ierr; 2868 const PetscInt *r,*c,*rout,*cout; 2869 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2870 const MatScalar *aa=a->a,*v; 2871 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2872 const PetscScalar *b; 2873 PetscFunctionBegin; 2874 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2875 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2876 t = a->solve_work; 2877 2878 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2879 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2880 2881 /* forward solve the lower triangular */ 2882 idx = 6*r[0]; 2883 t[0] = b[idx]; t[1] = b[1+idx]; 2884 t[2] = b[2+idx]; t[3] = b[3+idx]; 2885 t[4] = b[4+idx]; t[5] = b[5+idx]; 2886 for (i=1; i<n; i++) { 2887 v = aa + 36*ai[i]; 2888 vi = aj + ai[i]; 2889 nz = ai[i+1] - ai[i]; 2890 idx = 6*r[i]; 2891 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2892 s5 = b[4+idx]; s6 = b[5+idx]; 2893 for(m=0;m<nz;m++){ 2894 idx = 6*vi[m]; 2895 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2896 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2897 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2898 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2899 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2900 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2901 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2902 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2903 v += 36; 2904 } 2905 idx = 6*i; 2906 t[idx] = s1;t[1+idx] = s2; 2907 t[2+idx] = s3;t[3+idx] = s4; 2908 t[4+idx] = s5;t[5+idx] = s6; 2909 } 2910 /* backward solve the upper triangular */ 2911 for (i=n-1; i>=0; i--){ 2912 v = aa + 36*(adiag[i+1]+1); 2913 vi = aj + adiag[i+1]+1; 2914 nz = adiag[i] - adiag[i+1] - 1; 2915 idt = 6*i; 2916 s1 = t[idt]; s2 = t[1+idt]; 2917 s3 = t[2+idt];s4 = t[3+idt]; 2918 s5 = t[4+idt];s6 = t[5+idt]; 2919 for(m=0;m<nz;m++){ 2920 idx = 6*vi[m]; 2921 x1 = t[idx]; x2 = t[1+idx]; 2922 x3 = t[2+idx]; x4 = t[3+idx]; 2923 x5 = t[4+idx]; x6 = t[5+idx]; 2924 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2925 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2926 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2927 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2928 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2929 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2930 v += 36; 2931 } 2932 idc = 6*c[i]; 2933 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2934 v[18]*s4+v[24]*s5+v[30]*s6; 2935 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2936 v[19]*s4+v[25]*s5+v[31]*s6; 2937 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2938 v[20]*s4+v[26]*s5+v[32]*s6; 2939 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2940 v[21]*s4+v[27]*s5+v[33]*s6; 2941 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2942 v[22]*s4+v[28]*s5+v[34]*s6; 2943 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2944 v[23]*s4+v[29]*s5+v[35]*s6; 2945 } 2946 2947 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2948 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2949 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2950 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2951 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2952 PetscFunctionReturn(0); 2953 } 2954 2955 #undef __FUNCT__ 2956 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 2957 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2958 { 2959 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2960 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2961 PetscErrorCode ierr; 2962 PetscInt *diag = a->diag,jdx; 2963 const MatScalar *aa=a->a,*v; 2964 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2965 const PetscScalar *b; 2966 2967 PetscFunctionBegin; 2968 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2969 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2970 /* forward solve the lower triangular */ 2971 idx = 0; 2972 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2973 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2974 for (i=1; i<n; i++) { 2975 v = aa + 36*ai[i]; 2976 vi = aj + ai[i]; 2977 nz = diag[i] - ai[i]; 2978 idx = 6*i; 2979 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2980 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2981 while (nz--) { 2982 jdx = 6*(*vi++); 2983 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2984 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2985 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2986 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2987 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2988 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2989 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2990 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2991 v += 36; 2992 } 2993 x[idx] = s1; 2994 x[1+idx] = s2; 2995 x[2+idx] = s3; 2996 x[3+idx] = s4; 2997 x[4+idx] = s5; 2998 x[5+idx] = s6; 2999 } 3000 /* backward solve the upper triangular */ 3001 for (i=n-1; i>=0; i--){ 3002 v = aa + 36*diag[i] + 36; 3003 vi = aj + diag[i] + 1; 3004 nz = ai[i+1] - diag[i] - 1; 3005 idt = 6*i; 3006 s1 = x[idt]; s2 = x[1+idt]; 3007 s3 = x[2+idt]; s4 = x[3+idt]; 3008 s5 = x[4+idt]; s6 = x[5+idt]; 3009 while (nz--) { 3010 idx = 6*(*vi++); 3011 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3012 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3013 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3014 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3015 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3016 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3017 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3018 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3019 v += 36; 3020 } 3021 v = aa + 36*diag[i]; 3022 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3023 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3024 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3025 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3026 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3027 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3028 } 3029 3030 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3031 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3032 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3033 PetscFunctionReturn(0); 3034 } 3035 3036 #undef __FUNCT__ 3037 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3038 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3039 { 3040 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3041 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3042 PetscErrorCode ierr; 3043 PetscInt idx,jdx,idt; 3044 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3045 const MatScalar *aa=a->a,*v; 3046 PetscScalar *x; 3047 const PetscScalar *b; 3048 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3049 3050 PetscFunctionBegin; 3051 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3052 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3053 /* forward solve the lower triangular */ 3054 idx = 0; 3055 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3056 x[4] = b[4+idx];x[5] = b[5+idx]; 3057 for (i=1; i<n; i++) { 3058 v = aa + bs2*ai[i]; 3059 vi = aj + ai[i]; 3060 nz = ai[i+1] - ai[i]; 3061 idx = bs*i; 3062 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3063 s5 = b[4+idx];s6 = b[5+idx]; 3064 for(k=0;k<nz;k++){ 3065 jdx = bs*vi[k]; 3066 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3067 x5 = x[4+jdx]; x6 = x[5+jdx]; 3068 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3069 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3070 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3071 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3072 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3073 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3074 v += bs2; 3075 } 3076 3077 x[idx] = s1; 3078 x[1+idx] = s2; 3079 x[2+idx] = s3; 3080 x[3+idx] = s4; 3081 x[4+idx] = s5; 3082 x[5+idx] = s6; 3083 } 3084 3085 /* backward solve the upper triangular */ 3086 for (i=n-1; i>=0; i--){ 3087 v = aa + bs2*(adiag[i+1]+1); 3088 vi = aj + adiag[i+1]+1; 3089 nz = adiag[i] - adiag[i+1]-1; 3090 idt = bs*i; 3091 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3092 s5 = x[4+idt];s6 = x[5+idt]; 3093 for(k=0;k<nz;k++){ 3094 idx = bs*vi[k]; 3095 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3096 x5 = x[4+idx];x6 = x[5+idx]; 3097 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3098 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3099 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3100 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3101 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3102 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3103 v += bs2; 3104 } 3105 /* x = inv_diagonal*x */ 3106 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3107 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3108 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3109 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3110 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3111 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3112 } 3113 3114 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3115 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3116 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3117 PetscFunctionReturn(0); 3118 } 3119 3120 #undef __FUNCT__ 3121 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3122 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3123 { 3124 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3125 IS iscol=a->col,isrow=a->row; 3126 PetscErrorCode ierr; 3127 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3128 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3129 const MatScalar *aa=a->a,*v; 3130 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3131 const PetscScalar *b; 3132 3133 PetscFunctionBegin; 3134 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3135 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3136 t = a->solve_work; 3137 3138 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3139 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3140 3141 /* forward solve the lower triangular */ 3142 idx = 5*(*r++); 3143 t[0] = b[idx]; t[1] = b[1+idx]; 3144 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3145 for (i=1; i<n; i++) { 3146 v = aa + 25*ai[i]; 3147 vi = aj + ai[i]; 3148 nz = diag[i] - ai[i]; 3149 idx = 5*(*r++); 3150 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3151 s5 = b[4+idx]; 3152 while (nz--) { 3153 idx = 5*(*vi++); 3154 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3155 x4 = t[3+idx];x5 = t[4+idx]; 3156 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3157 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3158 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3159 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3160 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3161 v += 25; 3162 } 3163 idx = 5*i; 3164 t[idx] = s1;t[1+idx] = s2; 3165 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3166 } 3167 /* backward solve the upper triangular */ 3168 for (i=n-1; i>=0; i--){ 3169 v = aa + 25*diag[i] + 25; 3170 vi = aj + diag[i] + 1; 3171 nz = ai[i+1] - diag[i] - 1; 3172 idt = 5*i; 3173 s1 = t[idt]; s2 = t[1+idt]; 3174 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3175 while (nz--) { 3176 idx = 5*(*vi++); 3177 x1 = t[idx]; x2 = t[1+idx]; 3178 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3179 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3180 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3181 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3182 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3183 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3184 v += 25; 3185 } 3186 idc = 5*(*c--); 3187 v = aa + 25*diag[i]; 3188 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3189 v[15]*s4+v[20]*s5; 3190 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3191 v[16]*s4+v[21]*s5; 3192 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3193 v[17]*s4+v[22]*s5; 3194 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3195 v[18]*s4+v[23]*s5; 3196 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3197 v[19]*s4+v[24]*s5; 3198 } 3199 3200 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3201 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3202 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3203 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3204 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3205 PetscFunctionReturn(0); 3206 } 3207 3208 #undef __FUNCT__ 3209 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3210 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3211 { 3212 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3213 IS iscol=a->col,isrow=a->row; 3214 PetscErrorCode ierr; 3215 const PetscInt *r,*c,*rout,*cout; 3216 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3217 const MatScalar *aa=a->a,*v; 3218 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3219 const PetscScalar *b; 3220 3221 PetscFunctionBegin; 3222 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3223 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3224 t = a->solve_work; 3225 3226 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3227 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3228 3229 /* forward solve the lower triangular */ 3230 idx = 5*r[0]; 3231 t[0] = b[idx]; t[1] = b[1+idx]; 3232 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3233 for (i=1; i<n; i++) { 3234 v = aa + 25*ai[i]; 3235 vi = aj + ai[i]; 3236 nz = ai[i+1] - ai[i]; 3237 idx = 5*r[i]; 3238 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3239 s5 = b[4+idx]; 3240 for(m=0;m<nz;m++){ 3241 idx = 5*vi[m]; 3242 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3243 x4 = t[3+idx];x5 = t[4+idx]; 3244 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3245 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3246 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3247 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3248 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3249 v += 25; 3250 } 3251 idx = 5*i; 3252 t[idx] = s1;t[1+idx] = s2; 3253 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3254 } 3255 /* backward solve the upper triangular */ 3256 for (i=n-1; i>=0; i--){ 3257 v = aa + 25*(adiag[i+1]+1); 3258 vi = aj + adiag[i+1]+1; 3259 nz = adiag[i] - adiag[i+1] - 1; 3260 idt = 5*i; 3261 s1 = t[idt]; s2 = t[1+idt]; 3262 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3263 for(m=0;m<nz;m++){ 3264 idx = 5*vi[m]; 3265 x1 = t[idx]; x2 = t[1+idx]; 3266 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3267 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3268 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3269 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3270 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3271 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3272 v += 25; 3273 } 3274 idc = 5*c[i]; 3275 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3276 v[15]*s4+v[20]*s5; 3277 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3278 v[16]*s4+v[21]*s5; 3279 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3280 v[17]*s4+v[22]*s5; 3281 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3282 v[18]*s4+v[23]*s5; 3283 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3284 v[19]*s4+v[24]*s5; 3285 } 3286 3287 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3288 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3289 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3290 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3291 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3292 PetscFunctionReturn(0); 3293 } 3294 3295 #undef __FUNCT__ 3296 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3297 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3298 { 3299 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3300 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3301 PetscErrorCode ierr; 3302 PetscInt *diag = a->diag,jdx; 3303 const MatScalar *aa=a->a,*v; 3304 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3305 const PetscScalar *b; 3306 3307 PetscFunctionBegin; 3308 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3309 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3310 /* forward solve the lower triangular */ 3311 idx = 0; 3312 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3313 for (i=1; i<n; i++) { 3314 v = aa + 25*ai[i]; 3315 vi = aj + ai[i]; 3316 nz = diag[i] - ai[i]; 3317 idx = 5*i; 3318 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3319 while (nz--) { 3320 jdx = 5*(*vi++); 3321 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3322 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3323 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3324 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3325 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3326 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3327 v += 25; 3328 } 3329 x[idx] = s1; 3330 x[1+idx] = s2; 3331 x[2+idx] = s3; 3332 x[3+idx] = s4; 3333 x[4+idx] = s5; 3334 } 3335 /* backward solve the upper triangular */ 3336 for (i=n-1; i>=0; i--){ 3337 v = aa + 25*diag[i] + 25; 3338 vi = aj + diag[i] + 1; 3339 nz = ai[i+1] - diag[i] - 1; 3340 idt = 5*i; 3341 s1 = x[idt]; s2 = x[1+idt]; 3342 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3343 while (nz--) { 3344 idx = 5*(*vi++); 3345 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3346 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3347 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3348 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3349 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3350 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3351 v += 25; 3352 } 3353 v = aa + 25*diag[i]; 3354 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3355 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3356 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3357 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3358 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3359 } 3360 3361 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3362 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3363 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3364 PetscFunctionReturn(0); 3365 } 3366 3367 #undef __FUNCT__ 3368 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3369 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3370 { 3371 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3372 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 3373 PetscErrorCode ierr; 3374 PetscInt jdx; 3375 const MatScalar *aa=a->a,*v; 3376 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3377 const PetscScalar *b; 3378 3379 PetscFunctionBegin; 3380 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3382 /* forward solve the lower triangular */ 3383 idx = 0; 3384 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3385 for (i=1; i<n; i++) { 3386 v = aa + 25*ai[i]; 3387 vi = aj + ai[i]; 3388 nz = ai[i+1] - ai[i]; 3389 idx = 5*i; 3390 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3391 for(k=0;k<nz;k++) { 3392 jdx = 5*vi[k]; 3393 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3394 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3395 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3396 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3397 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3398 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3399 v += 25; 3400 } 3401 x[idx] = s1; 3402 x[1+idx] = s2; 3403 x[2+idx] = s3; 3404 x[3+idx] = s4; 3405 x[4+idx] = s5; 3406 } 3407 3408 /* backward solve the upper triangular */ 3409 for (i=n-1; i>=0; i--){ 3410 v = aa + 25*(adiag[i+1]+1); 3411 vi = aj + adiag[i+1]+1; 3412 nz = adiag[i] - adiag[i+1]-1; 3413 idt = 5*i; 3414 s1 = x[idt]; s2 = x[1+idt]; 3415 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3416 for(k=0;k<nz;k++){ 3417 idx = 5*vi[k]; 3418 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3419 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3420 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3421 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3422 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3423 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3424 v += 25; 3425 } 3426 /* x = inv_diagonal*x */ 3427 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3428 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3429 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3430 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3431 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3432 } 3433 3434 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3435 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3436 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3437 PetscFunctionReturn(0); 3438 } 3439 3440 #undef __FUNCT__ 3441 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3442 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3443 { 3444 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3445 IS iscol=a->col,isrow=a->row; 3446 PetscErrorCode ierr; 3447 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3448 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3449 const MatScalar *aa=a->a,*v; 3450 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3451 const PetscScalar *b; 3452 3453 PetscFunctionBegin; 3454 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3455 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3456 t = a->solve_work; 3457 3458 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3459 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3460 3461 /* forward solve the lower triangular */ 3462 idx = 4*(*r++); 3463 t[0] = b[idx]; t[1] = b[1+idx]; 3464 t[2] = b[2+idx]; t[3] = b[3+idx]; 3465 for (i=1; i<n; i++) { 3466 v = aa + 16*ai[i]; 3467 vi = aj + ai[i]; 3468 nz = diag[i] - ai[i]; 3469 idx = 4*(*r++); 3470 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3471 while (nz--) { 3472 idx = 4*(*vi++); 3473 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3474 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3475 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3476 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3477 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3478 v += 16; 3479 } 3480 idx = 4*i; 3481 t[idx] = s1;t[1+idx] = s2; 3482 t[2+idx] = s3;t[3+idx] = s4; 3483 } 3484 /* backward solve the upper triangular */ 3485 for (i=n-1; i>=0; i--){ 3486 v = aa + 16*diag[i] + 16; 3487 vi = aj + diag[i] + 1; 3488 nz = ai[i+1] - diag[i] - 1; 3489 idt = 4*i; 3490 s1 = t[idt]; s2 = t[1+idt]; 3491 s3 = t[2+idt];s4 = t[3+idt]; 3492 while (nz--) { 3493 idx = 4*(*vi++); 3494 x1 = t[idx]; x2 = t[1+idx]; 3495 x3 = t[2+idx]; x4 = t[3+idx]; 3496 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3497 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3498 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3499 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3500 v += 16; 3501 } 3502 idc = 4*(*c--); 3503 v = aa + 16*diag[i]; 3504 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3505 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3506 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3507 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3508 } 3509 3510 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3511 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3512 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3513 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3514 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3515 PetscFunctionReturn(0); 3516 } 3517 3518 #undef __FUNCT__ 3519 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3520 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3521 { 3522 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3523 IS iscol=a->col,isrow=a->row; 3524 PetscErrorCode ierr; 3525 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3526 const PetscInt *r,*c,*rout,*cout; 3527 const MatScalar *aa=a->a,*v; 3528 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3529 const PetscScalar *b; 3530 3531 PetscFunctionBegin; 3532 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3534 t = a->solve_work; 3535 3536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3538 3539 /* forward solve the lower triangular */ 3540 idx = 4*r[0]; 3541 t[0] = b[idx]; t[1] = b[1+idx]; 3542 t[2] = b[2+idx]; t[3] = b[3+idx]; 3543 for (i=1; i<n; i++) { 3544 v = aa + 16*ai[i]; 3545 vi = aj + ai[i]; 3546 nz = ai[i+1] - ai[i]; 3547 idx = 4*r[i]; 3548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3549 for(m=0;m<nz;m++){ 3550 idx = 4*vi[m]; 3551 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3552 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3553 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3554 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3555 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3556 v += 16; 3557 } 3558 idx = 4*i; 3559 t[idx] = s1;t[1+idx] = s2; 3560 t[2+idx] = s3;t[3+idx] = s4; 3561 } 3562 /* backward solve the upper triangular */ 3563 for (i=n-1; i>=0; i--){ 3564 v = aa + 16*(adiag[i+1]+1); 3565 vi = aj + adiag[i+1]+1; 3566 nz = adiag[i] - adiag[i+1] - 1; 3567 idt = 4*i; 3568 s1 = t[idt]; s2 = t[1+idt]; 3569 s3 = t[2+idt];s4 = t[3+idt]; 3570 for(m=0;m<nz;m++){ 3571 idx = 4*vi[m]; 3572 x1 = t[idx]; x2 = t[1+idx]; 3573 x3 = t[2+idx]; x4 = t[3+idx]; 3574 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3575 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3576 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3577 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3578 v += 16; 3579 } 3580 idc = 4*c[i]; 3581 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3582 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3583 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3584 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3585 } 3586 3587 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3588 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3589 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3590 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3591 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3592 PetscFunctionReturn(0); 3593 } 3594 3595 #undef __FUNCT__ 3596 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3597 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3598 { 3599 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3600 IS iscol=a->col,isrow=a->row; 3601 PetscErrorCode ierr; 3602 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3603 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3604 const MatScalar *aa=a->a,*v; 3605 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3606 PetscScalar *x; 3607 const PetscScalar *b; 3608 3609 PetscFunctionBegin; 3610 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3611 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3612 t = (MatScalar *)a->solve_work; 3613 3614 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3615 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3616 3617 /* forward solve the lower triangular */ 3618 idx = 4*(*r++); 3619 t[0] = (MatScalar)b[idx]; 3620 t[1] = (MatScalar)b[1+idx]; 3621 t[2] = (MatScalar)b[2+idx]; 3622 t[3] = (MatScalar)b[3+idx]; 3623 for (i=1; i<n; i++) { 3624 v = aa + 16*ai[i]; 3625 vi = aj + ai[i]; 3626 nz = diag[i] - ai[i]; 3627 idx = 4*(*r++); 3628 s1 = (MatScalar)b[idx]; 3629 s2 = (MatScalar)b[1+idx]; 3630 s3 = (MatScalar)b[2+idx]; 3631 s4 = (MatScalar)b[3+idx]; 3632 while (nz--) { 3633 idx = 4*(*vi++); 3634 x1 = t[idx]; 3635 x2 = t[1+idx]; 3636 x3 = t[2+idx]; 3637 x4 = t[3+idx]; 3638 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3639 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3640 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3641 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3642 v += 16; 3643 } 3644 idx = 4*i; 3645 t[idx] = s1; 3646 t[1+idx] = s2; 3647 t[2+idx] = s3; 3648 t[3+idx] = s4; 3649 } 3650 /* backward solve the upper triangular */ 3651 for (i=n-1; i>=0; i--){ 3652 v = aa + 16*diag[i] + 16; 3653 vi = aj + diag[i] + 1; 3654 nz = ai[i+1] - diag[i] - 1; 3655 idt = 4*i; 3656 s1 = t[idt]; 3657 s2 = t[1+idt]; 3658 s3 = t[2+idt]; 3659 s4 = t[3+idt]; 3660 while (nz--) { 3661 idx = 4*(*vi++); 3662 x1 = t[idx]; 3663 x2 = t[1+idx]; 3664 x3 = t[2+idx]; 3665 x4 = t[3+idx]; 3666 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3667 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3668 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3669 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3670 v += 16; 3671 } 3672 idc = 4*(*c--); 3673 v = aa + 16*diag[i]; 3674 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3675 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3676 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3677 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3678 x[idc] = (PetscScalar)t[idt]; 3679 x[1+idc] = (PetscScalar)t[1+idt]; 3680 x[2+idc] = (PetscScalar)t[2+idt]; 3681 x[3+idc] = (PetscScalar)t[3+idt]; 3682 } 3683 3684 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3685 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3686 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3687 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3688 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3689 PetscFunctionReturn(0); 3690 } 3691 3692 #if defined (PETSC_HAVE_SSE) 3693 3694 #include PETSC_HAVE_SSE 3695 3696 #undef __FUNCT__ 3697 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3698 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3699 { 3700 /* 3701 Note: This code uses demotion of double 3702 to float when performing the mixed-mode computation. 3703 This may not be numerically reasonable for all applications. 3704 */ 3705 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3706 IS iscol=a->col,isrow=a->row; 3707 PetscErrorCode ierr; 3708 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3709 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3710 MatScalar *aa=a->a,*v; 3711 PetscScalar *x,*b,*t; 3712 3713 /* Make space in temp stack for 16 Byte Aligned arrays */ 3714 float ssealignedspace[11],*tmps,*tmpx; 3715 unsigned long offset; 3716 3717 PetscFunctionBegin; 3718 SSE_SCOPE_BEGIN; 3719 3720 offset = (unsigned long)ssealignedspace % 16; 3721 if (offset) offset = (16 - offset)/4; 3722 tmps = &ssealignedspace[offset]; 3723 tmpx = &ssealignedspace[offset+4]; 3724 PREFETCH_NTA(aa+16*ai[1]); 3725 3726 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3727 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3728 t = a->solve_work; 3729 3730 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3731 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3732 3733 /* forward solve the lower triangular */ 3734 idx = 4*(*r++); 3735 t[0] = b[idx]; t[1] = b[1+idx]; 3736 t[2] = b[2+idx]; t[3] = b[3+idx]; 3737 v = aa + 16*ai[1]; 3738 3739 for (i=1; i<n;) { 3740 PREFETCH_NTA(&v[8]); 3741 vi = aj + ai[i]; 3742 nz = diag[i] - ai[i]; 3743 idx = 4*(*r++); 3744 3745 /* Demote sum from double to float */ 3746 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3747 LOAD_PS(tmps,XMM7); 3748 3749 while (nz--) { 3750 PREFETCH_NTA(&v[16]); 3751 idx = 4*(*vi++); 3752 3753 /* Demote solution (so far) from double to float */ 3754 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3755 3756 /* 4x4 Matrix-Vector product with negative accumulation: */ 3757 SSE_INLINE_BEGIN_2(tmpx,v) 3758 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3759 3760 /* First Column */ 3761 SSE_COPY_PS(XMM0,XMM6) 3762 SSE_SHUFFLE(XMM0,XMM0,0x00) 3763 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3764 SSE_SUB_PS(XMM7,XMM0) 3765 3766 /* Second Column */ 3767 SSE_COPY_PS(XMM1,XMM6) 3768 SSE_SHUFFLE(XMM1,XMM1,0x55) 3769 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3770 SSE_SUB_PS(XMM7,XMM1) 3771 3772 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3773 3774 /* Third Column */ 3775 SSE_COPY_PS(XMM2,XMM6) 3776 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3777 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3778 SSE_SUB_PS(XMM7,XMM2) 3779 3780 /* Fourth Column */ 3781 SSE_COPY_PS(XMM3,XMM6) 3782 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3783 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3784 SSE_SUB_PS(XMM7,XMM3) 3785 SSE_INLINE_END_2 3786 3787 v += 16; 3788 } 3789 idx = 4*i; 3790 v = aa + 16*ai[++i]; 3791 PREFETCH_NTA(v); 3792 STORE_PS(tmps,XMM7); 3793 3794 /* Promote result from float to double */ 3795 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3796 } 3797 /* backward solve the upper triangular */ 3798 idt = 4*(n-1); 3799 ai16 = 16*diag[n-1]; 3800 v = aa + ai16 + 16; 3801 for (i=n-1; i>=0;){ 3802 PREFETCH_NTA(&v[8]); 3803 vi = aj + diag[i] + 1; 3804 nz = ai[i+1] - diag[i] - 1; 3805 3806 /* Demote accumulator from double to float */ 3807 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3808 LOAD_PS(tmps,XMM7); 3809 3810 while (nz--) { 3811 PREFETCH_NTA(&v[16]); 3812 idx = 4*(*vi++); 3813 3814 /* Demote solution (so far) from double to float */ 3815 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3816 3817 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3818 SSE_INLINE_BEGIN_2(tmpx,v) 3819 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3820 3821 /* First Column */ 3822 SSE_COPY_PS(XMM0,XMM6) 3823 SSE_SHUFFLE(XMM0,XMM0,0x00) 3824 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3825 SSE_SUB_PS(XMM7,XMM0) 3826 3827 /* Second Column */ 3828 SSE_COPY_PS(XMM1,XMM6) 3829 SSE_SHUFFLE(XMM1,XMM1,0x55) 3830 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3831 SSE_SUB_PS(XMM7,XMM1) 3832 3833 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3834 3835 /* Third Column */ 3836 SSE_COPY_PS(XMM2,XMM6) 3837 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3838 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3839 SSE_SUB_PS(XMM7,XMM2) 3840 3841 /* Fourth Column */ 3842 SSE_COPY_PS(XMM3,XMM6) 3843 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3844 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3845 SSE_SUB_PS(XMM7,XMM3) 3846 SSE_INLINE_END_2 3847 v += 16; 3848 } 3849 v = aa + ai16; 3850 ai16 = 16*diag[--i]; 3851 PREFETCH_NTA(aa+ai16+16); 3852 /* 3853 Scale the result by the diagonal 4x4 block, 3854 which was inverted as part of the factorization 3855 */ 3856 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3857 /* First Column */ 3858 SSE_COPY_PS(XMM0,XMM7) 3859 SSE_SHUFFLE(XMM0,XMM0,0x00) 3860 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3861 3862 /* Second Column */ 3863 SSE_COPY_PS(XMM1,XMM7) 3864 SSE_SHUFFLE(XMM1,XMM1,0x55) 3865 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3866 SSE_ADD_PS(XMM0,XMM1) 3867 3868 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3869 3870 /* Third Column */ 3871 SSE_COPY_PS(XMM2,XMM7) 3872 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3873 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3874 SSE_ADD_PS(XMM0,XMM2) 3875 3876 /* Fourth Column */ 3877 SSE_COPY_PS(XMM3,XMM7) 3878 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3879 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3880 SSE_ADD_PS(XMM0,XMM3) 3881 3882 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3883 SSE_INLINE_END_3 3884 3885 /* Promote solution from float to double */ 3886 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3887 3888 /* Apply reordering to t and stream into x. */ 3889 /* This way, x doesn't pollute the cache. */ 3890 /* Be careful with size: 2 doubles = 4 floats! */ 3891 idc = 4*(*c--); 3892 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3893 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3894 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3895 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3896 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3897 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3898 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3899 SSE_INLINE_END_2 3900 v = aa + ai16 + 16; 3901 idt -= 4; 3902 } 3903 3904 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3905 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3906 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3907 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3908 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3909 SSE_SCOPE_END; 3910 PetscFunctionReturn(0); 3911 } 3912 3913 #endif 3914 3915 3916 /* 3917 Special case where the matrix was ILU(0) factored in the natural 3918 ordering. This eliminates the need for the column and row permutation. 3919 */ 3920 #undef __FUNCT__ 3921 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 3922 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3923 { 3924 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3925 PetscInt n=a->mbs; 3926 const PetscInt *ai=a->i,*aj=a->j; 3927 PetscErrorCode ierr; 3928 const PetscInt *diag = a->diag; 3929 const MatScalar *aa=a->a; 3930 PetscScalar *x; 3931 const PetscScalar *b; 3932 3933 PetscFunctionBegin; 3934 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3935 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3936 3937 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3938 { 3939 static PetscScalar w[2000]; /* very BAD need to fix */ 3940 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3941 } 3942 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3943 { 3944 static PetscScalar w[2000]; /* very BAD need to fix */ 3945 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3946 } 3947 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3948 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3949 #else 3950 { 3951 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3952 const MatScalar *v; 3953 PetscInt jdx,idt,idx,nz,i,ai16; 3954 const PetscInt *vi; 3955 3956 /* forward solve the lower triangular */ 3957 idx = 0; 3958 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3959 for (i=1; i<n; i++) { 3960 v = aa + 16*ai[i]; 3961 vi = aj + ai[i]; 3962 nz = diag[i] - ai[i]; 3963 idx += 4; 3964 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3965 while (nz--) { 3966 jdx = 4*(*vi++); 3967 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3968 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3969 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3970 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3971 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3972 v += 16; 3973 } 3974 x[idx] = s1; 3975 x[1+idx] = s2; 3976 x[2+idx] = s3; 3977 x[3+idx] = s4; 3978 } 3979 /* backward solve the upper triangular */ 3980 idt = 4*(n-1); 3981 for (i=n-1; i>=0; i--){ 3982 ai16 = 16*diag[i]; 3983 v = aa + ai16 + 16; 3984 vi = aj + diag[i] + 1; 3985 nz = ai[i+1] - diag[i] - 1; 3986 s1 = x[idt]; s2 = x[1+idt]; 3987 s3 = x[2+idt];s4 = x[3+idt]; 3988 while (nz--) { 3989 idx = 4*(*vi++); 3990 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3991 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3992 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3993 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3994 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3995 v += 16; 3996 } 3997 v = aa + ai16; 3998 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3999 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4000 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4001 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4002 idt -= 4; 4003 } 4004 } 4005 #endif 4006 4007 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4008 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4009 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4010 PetscFunctionReturn(0); 4011 } 4012 4013 #undef __FUNCT__ 4014 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4015 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4016 { 4017 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4018 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4019 PetscErrorCode ierr; 4020 PetscInt idx,jdx,idt; 4021 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4022 const MatScalar *aa=a->a,*v; 4023 PetscScalar *x; 4024 const PetscScalar *b; 4025 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4026 4027 PetscFunctionBegin; 4028 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4029 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4030 /* forward solve the lower triangular */ 4031 idx = 0; 4032 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4033 for (i=1; i<n; i++) { 4034 v = aa + bs2*ai[i]; 4035 vi = aj + ai[i]; 4036 nz = ai[i+1] - ai[i]; 4037 idx = bs*i; 4038 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4039 for(k=0;k<nz;k++) { 4040 jdx = bs*vi[k]; 4041 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4042 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4043 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4044 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4045 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4046 4047 v += bs2; 4048 } 4049 4050 x[idx] = s1; 4051 x[1+idx] = s2; 4052 x[2+idx] = s3; 4053 x[3+idx] = s4; 4054 } 4055 4056 /* backward solve the upper triangular */ 4057 for (i=n-1; i>=0; i--){ 4058 v = aa + bs2*(adiag[i+1]+1); 4059 vi = aj + adiag[i+1]+1; 4060 nz = adiag[i] - adiag[i+1]-1; 4061 idt = bs*i; 4062 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4063 4064 for(k=0;k<nz;k++){ 4065 idx = bs*vi[k]; 4066 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4067 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4068 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4069 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4070 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4071 4072 v += bs2; 4073 } 4074 /* x = inv_diagonal*x */ 4075 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4076 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4077 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4078 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4079 4080 } 4081 4082 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4083 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4084 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4085 PetscFunctionReturn(0); 4086 } 4087 4088 #undef __FUNCT__ 4089 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4090 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4091 { 4092 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4093 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4094 PetscErrorCode ierr; 4095 PetscInt *diag = a->diag; 4096 MatScalar *aa=a->a; 4097 PetscScalar *x,*b; 4098 4099 PetscFunctionBegin; 4100 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4101 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4102 4103 { 4104 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4105 MatScalar *v,*t=(MatScalar *)x; 4106 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 4107 4108 /* forward solve the lower triangular */ 4109 idx = 0; 4110 t[0] = (MatScalar)b[0]; 4111 t[1] = (MatScalar)b[1]; 4112 t[2] = (MatScalar)b[2]; 4113 t[3] = (MatScalar)b[3]; 4114 for (i=1; i<n; i++) { 4115 v = aa + 16*ai[i]; 4116 vi = aj + ai[i]; 4117 nz = diag[i] - ai[i]; 4118 idx += 4; 4119 s1 = (MatScalar)b[idx]; 4120 s2 = (MatScalar)b[1+idx]; 4121 s3 = (MatScalar)b[2+idx]; 4122 s4 = (MatScalar)b[3+idx]; 4123 while (nz--) { 4124 jdx = 4*(*vi++); 4125 x1 = t[jdx]; 4126 x2 = t[1+jdx]; 4127 x3 = t[2+jdx]; 4128 x4 = t[3+jdx]; 4129 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4130 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4131 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4132 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4133 v += 16; 4134 } 4135 t[idx] = s1; 4136 t[1+idx] = s2; 4137 t[2+idx] = s3; 4138 t[3+idx] = s4; 4139 } 4140 /* backward solve the upper triangular */ 4141 idt = 4*(n-1); 4142 for (i=n-1; i>=0; i--){ 4143 ai16 = 16*diag[i]; 4144 v = aa + ai16 + 16; 4145 vi = aj + diag[i] + 1; 4146 nz = ai[i+1] - diag[i] - 1; 4147 s1 = t[idt]; 4148 s2 = t[1+idt]; 4149 s3 = t[2+idt]; 4150 s4 = t[3+idt]; 4151 while (nz--) { 4152 idx = 4*(*vi++); 4153 x1 = (MatScalar)x[idx]; 4154 x2 = (MatScalar)x[1+idx]; 4155 x3 = (MatScalar)x[2+idx]; 4156 x4 = (MatScalar)x[3+idx]; 4157 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4158 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4159 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4160 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4161 v += 16; 4162 } 4163 v = aa + ai16; 4164 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4165 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4166 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4167 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4168 idt -= 4; 4169 } 4170 } 4171 4172 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4173 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4174 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4175 PetscFunctionReturn(0); 4176 } 4177 4178 #if defined (PETSC_HAVE_SSE) 4179 4180 #include PETSC_HAVE_SSE 4181 #undef __FUNCT__ 4182 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4183 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4184 { 4185 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4186 unsigned short *aj=(unsigned short *)a->j; 4187 PetscErrorCode ierr; 4188 int *ai=a->i,n=a->mbs,*diag = a->diag; 4189 MatScalar *aa=a->a; 4190 PetscScalar *x,*b; 4191 4192 PetscFunctionBegin; 4193 SSE_SCOPE_BEGIN; 4194 /* 4195 Note: This code currently uses demotion of double 4196 to float when performing the mixed-mode computation. 4197 This may not be numerically reasonable for all applications. 4198 */ 4199 PREFETCH_NTA(aa+16*ai[1]); 4200 4201 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4202 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4203 { 4204 /* x will first be computed in single precision then promoted inplace to double */ 4205 MatScalar *v,*t=(MatScalar *)x; 4206 int nz,i,idt,ai16; 4207 unsigned int jdx,idx; 4208 unsigned short *vi; 4209 /* Forward solve the lower triangular factor. */ 4210 4211 /* First block is the identity. */ 4212 idx = 0; 4213 CONVERT_DOUBLE4_FLOAT4(t,b); 4214 v = aa + 16*((unsigned int)ai[1]); 4215 4216 for (i=1; i<n;) { 4217 PREFETCH_NTA(&v[8]); 4218 vi = aj + ai[i]; 4219 nz = diag[i] - ai[i]; 4220 idx += 4; 4221 4222 /* Demote RHS from double to float. */ 4223 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4224 LOAD_PS(&t[idx],XMM7); 4225 4226 while (nz--) { 4227 PREFETCH_NTA(&v[16]); 4228 jdx = 4*((unsigned int)(*vi++)); 4229 4230 /* 4x4 Matrix-Vector product with negative accumulation: */ 4231 SSE_INLINE_BEGIN_2(&t[jdx],v) 4232 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4233 4234 /* First Column */ 4235 SSE_COPY_PS(XMM0,XMM6) 4236 SSE_SHUFFLE(XMM0,XMM0,0x00) 4237 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4238 SSE_SUB_PS(XMM7,XMM0) 4239 4240 /* Second Column */ 4241 SSE_COPY_PS(XMM1,XMM6) 4242 SSE_SHUFFLE(XMM1,XMM1,0x55) 4243 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4244 SSE_SUB_PS(XMM7,XMM1) 4245 4246 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4247 4248 /* Third Column */ 4249 SSE_COPY_PS(XMM2,XMM6) 4250 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4251 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4252 SSE_SUB_PS(XMM7,XMM2) 4253 4254 /* Fourth Column */ 4255 SSE_COPY_PS(XMM3,XMM6) 4256 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4257 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4258 SSE_SUB_PS(XMM7,XMM3) 4259 SSE_INLINE_END_2 4260 4261 v += 16; 4262 } 4263 v = aa + 16*ai[++i]; 4264 PREFETCH_NTA(v); 4265 STORE_PS(&t[idx],XMM7); 4266 } 4267 4268 /* Backward solve the upper triangular factor.*/ 4269 4270 idt = 4*(n-1); 4271 ai16 = 16*diag[n-1]; 4272 v = aa + ai16 + 16; 4273 for (i=n-1; i>=0;){ 4274 PREFETCH_NTA(&v[8]); 4275 vi = aj + diag[i] + 1; 4276 nz = ai[i+1] - diag[i] - 1; 4277 4278 LOAD_PS(&t[idt],XMM7); 4279 4280 while (nz--) { 4281 PREFETCH_NTA(&v[16]); 4282 idx = 4*((unsigned int)(*vi++)); 4283 4284 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4285 SSE_INLINE_BEGIN_2(&t[idx],v) 4286 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4287 4288 /* First Column */ 4289 SSE_COPY_PS(XMM0,XMM6) 4290 SSE_SHUFFLE(XMM0,XMM0,0x00) 4291 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4292 SSE_SUB_PS(XMM7,XMM0) 4293 4294 /* Second Column */ 4295 SSE_COPY_PS(XMM1,XMM6) 4296 SSE_SHUFFLE(XMM1,XMM1,0x55) 4297 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4298 SSE_SUB_PS(XMM7,XMM1) 4299 4300 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4301 4302 /* Third Column */ 4303 SSE_COPY_PS(XMM2,XMM6) 4304 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4305 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4306 SSE_SUB_PS(XMM7,XMM2) 4307 4308 /* Fourth Column */ 4309 SSE_COPY_PS(XMM3,XMM6) 4310 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4311 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4312 SSE_SUB_PS(XMM7,XMM3) 4313 SSE_INLINE_END_2 4314 v += 16; 4315 } 4316 v = aa + ai16; 4317 ai16 = 16*diag[--i]; 4318 PREFETCH_NTA(aa+ai16+16); 4319 /* 4320 Scale the result by the diagonal 4x4 block, 4321 which was inverted as part of the factorization 4322 */ 4323 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4324 /* First Column */ 4325 SSE_COPY_PS(XMM0,XMM7) 4326 SSE_SHUFFLE(XMM0,XMM0,0x00) 4327 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4328 4329 /* Second Column */ 4330 SSE_COPY_PS(XMM1,XMM7) 4331 SSE_SHUFFLE(XMM1,XMM1,0x55) 4332 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4333 SSE_ADD_PS(XMM0,XMM1) 4334 4335 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4336 4337 /* Third Column */ 4338 SSE_COPY_PS(XMM2,XMM7) 4339 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4340 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4341 SSE_ADD_PS(XMM0,XMM2) 4342 4343 /* Fourth Column */ 4344 SSE_COPY_PS(XMM3,XMM7) 4345 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4346 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4347 SSE_ADD_PS(XMM0,XMM3) 4348 4349 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4350 SSE_INLINE_END_3 4351 4352 v = aa + ai16 + 16; 4353 idt -= 4; 4354 } 4355 4356 /* Convert t from single precision back to double precision (inplace)*/ 4357 idt = 4*(n-1); 4358 for (i=n-1;i>=0;i--) { 4359 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4360 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4361 PetscScalar *xtemp=&x[idt]; 4362 MatScalar *ttemp=&t[idt]; 4363 xtemp[3] = (PetscScalar)ttemp[3]; 4364 xtemp[2] = (PetscScalar)ttemp[2]; 4365 xtemp[1] = (PetscScalar)ttemp[1]; 4366 xtemp[0] = (PetscScalar)ttemp[0]; 4367 idt -= 4; 4368 } 4369 4370 } /* End of artificial scope. */ 4371 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4372 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4373 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4374 SSE_SCOPE_END; 4375 PetscFunctionReturn(0); 4376 } 4377 4378 #undef __FUNCT__ 4379 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4380 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4381 { 4382 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4383 int *aj=a->j; 4384 PetscErrorCode ierr; 4385 int *ai=a->i,n=a->mbs,*diag = a->diag; 4386 MatScalar *aa=a->a; 4387 PetscScalar *x,*b; 4388 4389 PetscFunctionBegin; 4390 SSE_SCOPE_BEGIN; 4391 /* 4392 Note: This code currently uses demotion of double 4393 to float when performing the mixed-mode computation. 4394 This may not be numerically reasonable for all applications. 4395 */ 4396 PREFETCH_NTA(aa+16*ai[1]); 4397 4398 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4399 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4400 { 4401 /* x will first be computed in single precision then promoted inplace to double */ 4402 MatScalar *v,*t=(MatScalar *)x; 4403 int nz,i,idt,ai16; 4404 int jdx,idx; 4405 int *vi; 4406 /* Forward solve the lower triangular factor. */ 4407 4408 /* First block is the identity. */ 4409 idx = 0; 4410 CONVERT_DOUBLE4_FLOAT4(t,b); 4411 v = aa + 16*ai[1]; 4412 4413 for (i=1; i<n;) { 4414 PREFETCH_NTA(&v[8]); 4415 vi = aj + ai[i]; 4416 nz = diag[i] - ai[i]; 4417 idx += 4; 4418 4419 /* Demote RHS from double to float. */ 4420 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4421 LOAD_PS(&t[idx],XMM7); 4422 4423 while (nz--) { 4424 PREFETCH_NTA(&v[16]); 4425 jdx = 4*(*vi++); 4426 /* jdx = *vi++; */ 4427 4428 /* 4x4 Matrix-Vector product with negative accumulation: */ 4429 SSE_INLINE_BEGIN_2(&t[jdx],v) 4430 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4431 4432 /* First Column */ 4433 SSE_COPY_PS(XMM0,XMM6) 4434 SSE_SHUFFLE(XMM0,XMM0,0x00) 4435 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4436 SSE_SUB_PS(XMM7,XMM0) 4437 4438 /* Second Column */ 4439 SSE_COPY_PS(XMM1,XMM6) 4440 SSE_SHUFFLE(XMM1,XMM1,0x55) 4441 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4442 SSE_SUB_PS(XMM7,XMM1) 4443 4444 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4445 4446 /* Third Column */ 4447 SSE_COPY_PS(XMM2,XMM6) 4448 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4449 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4450 SSE_SUB_PS(XMM7,XMM2) 4451 4452 /* Fourth Column */ 4453 SSE_COPY_PS(XMM3,XMM6) 4454 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4455 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4456 SSE_SUB_PS(XMM7,XMM3) 4457 SSE_INLINE_END_2 4458 4459 v += 16; 4460 } 4461 v = aa + 16*ai[++i]; 4462 PREFETCH_NTA(v); 4463 STORE_PS(&t[idx],XMM7); 4464 } 4465 4466 /* Backward solve the upper triangular factor.*/ 4467 4468 idt = 4*(n-1); 4469 ai16 = 16*diag[n-1]; 4470 v = aa + ai16 + 16; 4471 for (i=n-1; i>=0;){ 4472 PREFETCH_NTA(&v[8]); 4473 vi = aj + diag[i] + 1; 4474 nz = ai[i+1] - diag[i] - 1; 4475 4476 LOAD_PS(&t[idt],XMM7); 4477 4478 while (nz--) { 4479 PREFETCH_NTA(&v[16]); 4480 idx = 4*(*vi++); 4481 /* idx = *vi++; */ 4482 4483 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4484 SSE_INLINE_BEGIN_2(&t[idx],v) 4485 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4486 4487 /* First Column */ 4488 SSE_COPY_PS(XMM0,XMM6) 4489 SSE_SHUFFLE(XMM0,XMM0,0x00) 4490 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4491 SSE_SUB_PS(XMM7,XMM0) 4492 4493 /* Second Column */ 4494 SSE_COPY_PS(XMM1,XMM6) 4495 SSE_SHUFFLE(XMM1,XMM1,0x55) 4496 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4497 SSE_SUB_PS(XMM7,XMM1) 4498 4499 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4500 4501 /* Third Column */ 4502 SSE_COPY_PS(XMM2,XMM6) 4503 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4504 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4505 SSE_SUB_PS(XMM7,XMM2) 4506 4507 /* Fourth Column */ 4508 SSE_COPY_PS(XMM3,XMM6) 4509 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4510 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4511 SSE_SUB_PS(XMM7,XMM3) 4512 SSE_INLINE_END_2 4513 v += 16; 4514 } 4515 v = aa + ai16; 4516 ai16 = 16*diag[--i]; 4517 PREFETCH_NTA(aa+ai16+16); 4518 /* 4519 Scale the result by the diagonal 4x4 block, 4520 which was inverted as part of the factorization 4521 */ 4522 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4523 /* First Column */ 4524 SSE_COPY_PS(XMM0,XMM7) 4525 SSE_SHUFFLE(XMM0,XMM0,0x00) 4526 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4527 4528 /* Second Column */ 4529 SSE_COPY_PS(XMM1,XMM7) 4530 SSE_SHUFFLE(XMM1,XMM1,0x55) 4531 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4532 SSE_ADD_PS(XMM0,XMM1) 4533 4534 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4535 4536 /* Third Column */ 4537 SSE_COPY_PS(XMM2,XMM7) 4538 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4539 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4540 SSE_ADD_PS(XMM0,XMM2) 4541 4542 /* Fourth Column */ 4543 SSE_COPY_PS(XMM3,XMM7) 4544 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4545 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4546 SSE_ADD_PS(XMM0,XMM3) 4547 4548 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4549 SSE_INLINE_END_3 4550 4551 v = aa + ai16 + 16; 4552 idt -= 4; 4553 } 4554 4555 /* Convert t from single precision back to double precision (inplace)*/ 4556 idt = 4*(n-1); 4557 for (i=n-1;i>=0;i--) { 4558 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4559 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4560 PetscScalar *xtemp=&x[idt]; 4561 MatScalar *ttemp=&t[idt]; 4562 xtemp[3] = (PetscScalar)ttemp[3]; 4563 xtemp[2] = (PetscScalar)ttemp[2]; 4564 xtemp[1] = (PetscScalar)ttemp[1]; 4565 xtemp[0] = (PetscScalar)ttemp[0]; 4566 idt -= 4; 4567 } 4568 4569 } /* End of artificial scope. */ 4570 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4571 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4572 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4573 SSE_SCOPE_END; 4574 PetscFunctionReturn(0); 4575 } 4576 4577 #endif 4578 4579 #undef __FUNCT__ 4580 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4581 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4582 { 4583 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4584 IS iscol=a->col,isrow=a->row; 4585 PetscErrorCode ierr; 4586 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4587 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4588 const MatScalar *aa=a->a,*v; 4589 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4590 const PetscScalar *b; 4591 4592 PetscFunctionBegin; 4593 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4594 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4595 t = a->solve_work; 4596 4597 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4598 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4599 4600 /* forward solve the lower triangular */ 4601 idx = 3*(*r++); 4602 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4603 for (i=1; i<n; i++) { 4604 v = aa + 9*ai[i]; 4605 vi = aj + ai[i]; 4606 nz = diag[i] - ai[i]; 4607 idx = 3*(*r++); 4608 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4609 while (nz--) { 4610 idx = 3*(*vi++); 4611 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4612 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4613 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4614 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4615 v += 9; 4616 } 4617 idx = 3*i; 4618 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4619 } 4620 /* backward solve the upper triangular */ 4621 for (i=n-1; i>=0; i--){ 4622 v = aa + 9*diag[i] + 9; 4623 vi = aj + diag[i] + 1; 4624 nz = ai[i+1] - diag[i] - 1; 4625 idt = 3*i; 4626 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4627 while (nz--) { 4628 idx = 3*(*vi++); 4629 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4630 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4631 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4632 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4633 v += 9; 4634 } 4635 idc = 3*(*c--); 4636 v = aa + 9*diag[i]; 4637 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4638 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4639 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4640 } 4641 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4642 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4643 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4644 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4645 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4646 PetscFunctionReturn(0); 4647 } 4648 4649 #undef __FUNCT__ 4650 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4651 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4652 { 4653 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4654 IS iscol=a->col,isrow=a->row; 4655 PetscErrorCode ierr; 4656 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4657 const PetscInt *r,*c,*rout,*cout; 4658 const MatScalar *aa=a->a,*v; 4659 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4660 const PetscScalar *b; 4661 4662 PetscFunctionBegin; 4663 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4664 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4665 t = a->solve_work; 4666 4667 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4668 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4669 4670 /* forward solve the lower triangular */ 4671 idx = 3*r[0]; 4672 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4673 for (i=1; i<n; i++) { 4674 v = aa + 9*ai[i]; 4675 vi = aj + ai[i]; 4676 nz = ai[i+1] - ai[i]; 4677 idx = 3*r[i]; 4678 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4679 for(m=0;m<nz;m++){ 4680 idx = 3*vi[m]; 4681 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4682 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4683 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4684 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4685 v += 9; 4686 } 4687 idx = 3*i; 4688 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4689 } 4690 /* backward solve the upper triangular */ 4691 for (i=n-1; i>=0; i--){ 4692 v = aa + 9*(adiag[i+1]+1); 4693 vi = aj + adiag[i+1]+1; 4694 nz = adiag[i] - adiag[i+1] - 1; 4695 idt = 3*i; 4696 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4697 for(m=0;m<nz;m++){ 4698 idx = 3*vi[m]; 4699 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4700 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4701 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4702 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4703 v += 9; 4704 } 4705 idc = 3*c[i]; 4706 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4707 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4708 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4709 } 4710 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4711 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4712 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4714 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4715 PetscFunctionReturn(0); 4716 } 4717 4718 /* 4719 Special case where the matrix was ILU(0) factored in the natural 4720 ordering. This eliminates the need for the column and row permutation. 4721 */ 4722 #undef __FUNCT__ 4723 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4724 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4725 { 4726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4727 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4728 PetscErrorCode ierr; 4729 PetscInt *diag = a->diag; 4730 const MatScalar *aa=a->a,*v; 4731 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4732 const PetscScalar *b; 4733 PetscInt jdx,idt,idx,nz,*vi,i; 4734 4735 PetscFunctionBegin; 4736 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4737 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4738 4739 /* forward solve the lower triangular */ 4740 idx = 0; 4741 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4742 for (i=1; i<n; i++) { 4743 v = aa + 9*ai[i]; 4744 vi = aj + ai[i]; 4745 nz = diag[i] - ai[i]; 4746 idx += 3; 4747 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4748 while (nz--) { 4749 jdx = 3*(*vi++); 4750 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4751 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4752 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4753 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4754 v += 9; 4755 } 4756 x[idx] = s1; 4757 x[1+idx] = s2; 4758 x[2+idx] = s3; 4759 } 4760 /* backward solve the upper triangular */ 4761 for (i=n-1; i>=0; i--){ 4762 v = aa + 9*diag[i] + 9; 4763 vi = aj + diag[i] + 1; 4764 nz = ai[i+1] - diag[i] - 1; 4765 idt = 3*i; 4766 s1 = x[idt]; s2 = x[1+idt]; 4767 s3 = x[2+idt]; 4768 while (nz--) { 4769 idx = 3*(*vi++); 4770 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4771 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4772 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4773 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4774 v += 9; 4775 } 4776 v = aa + 9*diag[i]; 4777 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4778 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4779 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4780 } 4781 4782 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4783 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4784 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4785 PetscFunctionReturn(0); 4786 } 4787 4788 #undef __FUNCT__ 4789 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4790 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4791 { 4792 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4793 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4794 PetscErrorCode ierr; 4795 PetscInt idx,jdx,idt; 4796 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4797 const MatScalar *aa=a->a,*v; 4798 PetscScalar *x; 4799 const PetscScalar *b; 4800 PetscScalar s1,s2,s3,x1,x2,x3; 4801 4802 PetscFunctionBegin; 4803 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4804 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4805 /* forward solve the lower triangular */ 4806 idx = 0; 4807 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4808 for (i=1; i<n; i++) { 4809 v = aa + bs2*ai[i]; 4810 vi = aj + ai[i]; 4811 nz = ai[i+1] - ai[i]; 4812 idx = bs*i; 4813 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4814 for(k=0;k<nz;k++){ 4815 jdx = bs*vi[k]; 4816 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4817 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4818 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4819 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4820 4821 v += bs2; 4822 } 4823 4824 x[idx] = s1; 4825 x[1+idx] = s2; 4826 x[2+idx] = s3; 4827 } 4828 4829 /* backward solve the upper triangular */ 4830 for (i=n-1; i>=0; i--){ 4831 v = aa + bs2*(adiag[i+1]+1); 4832 vi = aj + adiag[i+1]+1; 4833 nz = adiag[i] - adiag[i+1]-1; 4834 idt = bs*i; 4835 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4836 4837 for(k=0;k<nz;k++){ 4838 idx = bs*vi[k]; 4839 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4840 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4841 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4842 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4843 4844 v += bs2; 4845 } 4846 /* x = inv_diagonal*x */ 4847 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4848 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4849 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4850 4851 } 4852 4853 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4854 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4855 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4856 PetscFunctionReturn(0); 4857 } 4858 4859 #undef __FUNCT__ 4860 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 4861 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 4862 { 4863 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4864 IS iscol=a->col,isrow=a->row; 4865 PetscErrorCode ierr; 4866 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4867 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4868 const MatScalar *aa=a->a,*v; 4869 PetscScalar *x,s1,s2,x1,x2,*t; 4870 const PetscScalar *b; 4871 4872 PetscFunctionBegin; 4873 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4874 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4875 t = a->solve_work; 4876 4877 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4878 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4879 4880 /* forward solve the lower triangular */ 4881 idx = 2*(*r++); 4882 t[0] = b[idx]; t[1] = b[1+idx]; 4883 for (i=1; i<n; i++) { 4884 v = aa + 4*ai[i]; 4885 vi = aj + ai[i]; 4886 nz = diag[i] - ai[i]; 4887 idx = 2*(*r++); 4888 s1 = b[idx]; s2 = b[1+idx]; 4889 while (nz--) { 4890 idx = 2*(*vi++); 4891 x1 = t[idx]; x2 = t[1+idx]; 4892 s1 -= v[0]*x1 + v[2]*x2; 4893 s2 -= v[1]*x1 + v[3]*x2; 4894 v += 4; 4895 } 4896 idx = 2*i; 4897 t[idx] = s1; t[1+idx] = s2; 4898 } 4899 /* backward solve the upper triangular */ 4900 for (i=n-1; i>=0; i--){ 4901 v = aa + 4*diag[i] + 4; 4902 vi = aj + diag[i] + 1; 4903 nz = ai[i+1] - diag[i] - 1; 4904 idt = 2*i; 4905 s1 = t[idt]; s2 = t[1+idt]; 4906 while (nz--) { 4907 idx = 2*(*vi++); 4908 x1 = t[idx]; x2 = t[1+idx]; 4909 s1 -= v[0]*x1 + v[2]*x2; 4910 s2 -= v[1]*x1 + v[3]*x2; 4911 v += 4; 4912 } 4913 idc = 2*(*c--); 4914 v = aa + 4*diag[i]; 4915 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4916 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4917 } 4918 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4919 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4920 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4921 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4922 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4923 PetscFunctionReturn(0); 4924 } 4925 4926 #undef __FUNCT__ 4927 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4928 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4929 { 4930 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4931 IS iscol=a->col,isrow=a->row; 4932 PetscErrorCode ierr; 4933 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4934 const PetscInt *r,*c,*rout,*cout; 4935 const MatScalar *aa=a->a,*v; 4936 PetscScalar *x,s1,s2,x1,x2,*t; 4937 const PetscScalar *b; 4938 4939 PetscFunctionBegin; 4940 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4941 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4942 t = a->solve_work; 4943 4944 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4945 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4946 4947 /* forward solve the lower triangular */ 4948 idx = 2*r[0]; 4949 t[0] = b[idx]; t[1] = b[1+idx]; 4950 for (i=1; i<n; i++) { 4951 v = aa + 4*ai[i]; 4952 vi = aj + ai[i]; 4953 nz = ai[i+1] - ai[i]; 4954 idx = 2*r[i]; 4955 s1 = b[idx]; s2 = b[1+idx]; 4956 for(m=0;m<nz;m++){ 4957 jdx = 2*vi[m]; 4958 x1 = t[jdx]; x2 = t[1+jdx]; 4959 s1 -= v[0]*x1 + v[2]*x2; 4960 s2 -= v[1]*x1 + v[3]*x2; 4961 v += 4; 4962 } 4963 idx = 2*i; 4964 t[idx] = s1; t[1+idx] = s2; 4965 } 4966 /* backward solve the upper triangular */ 4967 for (i=n-1; i>=0; i--){ 4968 v = aa + 4*(adiag[i+1]+1); 4969 vi = aj + adiag[i+1]+1; 4970 nz = adiag[i] - adiag[i+1] - 1; 4971 idt = 2*i; 4972 s1 = t[idt]; s2 = t[1+idt]; 4973 for(m=0;m<nz;m++){ 4974 idx = 2*vi[m]; 4975 x1 = t[idx]; x2 = t[1+idx]; 4976 s1 -= v[0]*x1 + v[2]*x2; 4977 s2 -= v[1]*x1 + v[3]*x2; 4978 v += 4; 4979 } 4980 idc = 2*c[i]; 4981 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4982 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4983 } 4984 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4985 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4986 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4987 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4988 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4989 PetscFunctionReturn(0); 4990 } 4991 4992 /* 4993 Special case where the matrix was ILU(0) factored in the natural 4994 ordering. This eliminates the need for the column and row permutation. 4995 */ 4996 #undef __FUNCT__ 4997 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 4998 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4999 { 5000 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5001 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5002 PetscErrorCode ierr; 5003 PetscInt *diag = a->diag; 5004 const MatScalar *aa=a->a,*v; 5005 PetscScalar *x,s1,s2,x1,x2; 5006 const PetscScalar *b; 5007 PetscInt jdx,idt,idx,nz,*vi,i; 5008 5009 PetscFunctionBegin; 5010 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5011 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5012 5013 /* forward solve the lower triangular */ 5014 idx = 0; 5015 x[0] = b[0]; x[1] = b[1]; 5016 for (i=1; i<n; i++) { 5017 v = aa + 4*ai[i]; 5018 vi = aj + ai[i]; 5019 nz = diag[i] - ai[i]; 5020 idx += 2; 5021 s1 = b[idx];s2 = b[1+idx]; 5022 while (nz--) { 5023 jdx = 2*(*vi++); 5024 x1 = x[jdx];x2 = x[1+jdx]; 5025 s1 -= v[0]*x1 + v[2]*x2; 5026 s2 -= v[1]*x1 + v[3]*x2; 5027 v += 4; 5028 } 5029 x[idx] = s1; 5030 x[1+idx] = s2; 5031 } 5032 /* backward solve the upper triangular */ 5033 for (i=n-1; i>=0; i--){ 5034 v = aa + 4*diag[i] + 4; 5035 vi = aj + diag[i] + 1; 5036 nz = ai[i+1] - diag[i] - 1; 5037 idt = 2*i; 5038 s1 = x[idt]; s2 = x[1+idt]; 5039 while (nz--) { 5040 idx = 2*(*vi++); 5041 x1 = x[idx]; x2 = x[1+idx]; 5042 s1 -= v[0]*x1 + v[2]*x2; 5043 s2 -= v[1]*x1 + v[3]*x2; 5044 v += 4; 5045 } 5046 v = aa + 4*diag[i]; 5047 x[idt] = v[0]*s1 + v[2]*s2; 5048 x[1+idt] = v[1]*s1 + v[3]*s2; 5049 } 5050 5051 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5052 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5053 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5054 PetscFunctionReturn(0); 5055 } 5056 5057 #undef __FUNCT__ 5058 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5059 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5060 { 5061 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5062 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 5063 PetscErrorCode ierr; 5064 PetscInt jdx; 5065 const MatScalar *aa=a->a,*v; 5066 PetscScalar *x,s1,s2,x1,x2; 5067 const PetscScalar *b; 5068 5069 PetscFunctionBegin; 5070 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5071 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5072 /* forward solve the lower triangular */ 5073 idx = 0; 5074 x[0] = b[idx]; x[1] = b[1+idx]; 5075 for (i=1; i<n; i++) { 5076 v = aa + 4*ai[i]; 5077 vi = aj + ai[i]; 5078 nz = ai[i+1] - ai[i]; 5079 idx = 2*i; 5080 s1 = b[idx];s2 = b[1+idx]; 5081 for(k=0;k<nz;k++){ 5082 jdx = 2*vi[k]; 5083 x1 = x[jdx];x2 = x[1+jdx]; 5084 s1 -= v[0]*x1 + v[2]*x2; 5085 s2 -= v[1]*x1 + v[3]*x2; 5086 v += 4; 5087 } 5088 x[idx] = s1; 5089 x[1+idx] = s2; 5090 } 5091 5092 /* backward solve the upper triangular */ 5093 for (i=n-1; i>=0; i--){ 5094 v = aa + 4*(adiag[i+1]+1); 5095 vi = aj + adiag[i+1]+1; 5096 nz = adiag[i] - adiag[i+1]-1; 5097 idt = 2*i; 5098 s1 = x[idt]; s2 = x[1+idt]; 5099 for(k=0;k<nz;k++){ 5100 idx = 2*vi[k]; 5101 x1 = x[idx]; x2 = x[1+idx]; 5102 s1 -= v[0]*x1 + v[2]*x2; 5103 s2 -= v[1]*x1 + v[3]*x2; 5104 v += 4; 5105 } 5106 /* x = inv_diagonal*x */ 5107 x[idt] = v[0]*s1 + v[2]*s2; 5108 x[1+idt] = v[1]*s1 + v[3]*s2; 5109 } 5110 5111 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5113 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5114 PetscFunctionReturn(0); 5115 } 5116 5117 #undef __FUNCT__ 5118 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5119 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5120 { 5121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5122 IS iscol=a->col,isrow=a->row; 5123 PetscErrorCode ierr; 5124 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 5125 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5126 MatScalar *aa=a->a,*v; 5127 PetscScalar *x,*b,s1,*t; 5128 5129 PetscFunctionBegin; 5130 if (!n) PetscFunctionReturn(0); 5131 5132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5134 t = a->solve_work; 5135 5136 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5137 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5138 5139 /* forward solve the lower triangular */ 5140 t[0] = b[*r++]; 5141 for (i=1; i<n; i++) { 5142 v = aa + ai[i]; 5143 vi = aj + ai[i]; 5144 nz = diag[i] - ai[i]; 5145 s1 = b[*r++]; 5146 while (nz--) { 5147 s1 -= (*v++)*t[*vi++]; 5148 } 5149 t[i] = s1; 5150 } 5151 /* backward solve the upper triangular */ 5152 for (i=n-1; i>=0; i--){ 5153 v = aa + diag[i] + 1; 5154 vi = aj + diag[i] + 1; 5155 nz = ai[i+1] - diag[i] - 1; 5156 s1 = t[i]; 5157 while (nz--) { 5158 s1 -= (*v++)*t[*vi++]; 5159 } 5160 x[*c--] = t[i] = aa[diag[i]]*s1; 5161 } 5162 5163 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5164 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5165 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5167 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5168 PetscFunctionReturn(0); 5169 } 5170 /* 5171 Special case where the matrix was ILU(0) factored in the natural 5172 ordering. This eliminates the need for the column and row permutation. 5173 */ 5174 #undef __FUNCT__ 5175 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5176 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5177 { 5178 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5179 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5180 PetscErrorCode ierr; 5181 PetscInt *diag = a->diag; 5182 MatScalar *aa=a->a; 5183 PetscScalar *x,*b; 5184 PetscScalar s1,x1; 5185 MatScalar *v; 5186 PetscInt jdx,idt,idx,nz,*vi,i; 5187 5188 PetscFunctionBegin; 5189 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5190 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5191 5192 /* forward solve the lower triangular */ 5193 idx = 0; 5194 x[0] = b[0]; 5195 for (i=1; i<n; i++) { 5196 v = aa + ai[i]; 5197 vi = aj + ai[i]; 5198 nz = diag[i] - ai[i]; 5199 idx += 1; 5200 s1 = b[idx]; 5201 while (nz--) { 5202 jdx = *vi++; 5203 x1 = x[jdx]; 5204 s1 -= v[0]*x1; 5205 v += 1; 5206 } 5207 x[idx] = s1; 5208 } 5209 /* backward solve the upper triangular */ 5210 for (i=n-1; i>=0; i--){ 5211 v = aa + diag[i] + 1; 5212 vi = aj + diag[i] + 1; 5213 nz = ai[i+1] - diag[i] - 1; 5214 idt = i; 5215 s1 = x[idt]; 5216 while (nz--) { 5217 idx = *vi++; 5218 x1 = x[idx]; 5219 s1 -= v[0]*x1; 5220 v += 1; 5221 } 5222 v = aa + diag[i]; 5223 x[idt] = v[0]*s1; 5224 } 5225 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5226 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5227 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5228 PetscFunctionReturn(0); 5229 } 5230 5231 /* ----------------------------------------------------------------*/ 5232 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5233 //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_inplace(Mat,PetscTruth); 5234 //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5235 5236 /* bs = 15 for PFLOTRAN */ 5237 #undef __FUNCT__ 5238 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15" 5239 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15(Mat B,Mat A,const MatFactorInfo *info) 5240 { 5241 Mat C=B; 5242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5243 IS isrow = b->row,isicol = b->icol; 5244 PetscErrorCode ierr; 5245 const PetscInt *r,*ic,*ics; 5246 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5247 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj,*v_pivots; 5248 MatScalar *rtmp,*pc,*mwork,*v,*v_work,*pv,*aa=a->a; 5249 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5250 PetscReal shift = info->shiftinblocks; 5251 5252 PetscFunctionBegin; 5253 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5254 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5255 5256 5257 /* generate work space needed by the factorization */ 5258 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5259 ierr = PetscMalloc2(bs,MatScalar,&v_work,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5260 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5261 ics = ic; 5262 5263 for (i=0; i<n; i++){ 5264 /* zero rtmp */ 5265 /* L part */ 5266 nz = bi[i+1] - bi[i]; 5267 bjtmp = bj + bi[i]; 5268 for (j=0; j<nz; j++){ 5269 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5270 } 5271 5272 /* U part */ 5273 nz = bdiag[i] - bdiag[i+1]; 5274 bjtmp = bj + bdiag[i+1]+1; 5275 for (j=0; j<nz; j++){ 5276 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5277 } 5278 5279 /* load in initial (unfactored row) */ 5280 nz = ai[r[i]+1] - ai[r[i]]; 5281 ajtmp = aj + ai[r[i]]; 5282 v = aa + bs2*ai[r[i]]; 5283 for (j=0; j<nz; j++) { 5284 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5285 } 5286 5287 /* elimination */ 5288 bjtmp = bj + bi[i]; 5289 nzL = bi[i+1] - bi[i]; 5290 for(k=0;k < nzL;k++) { 5291 row = bjtmp[k]; 5292 pc = rtmp + bs2*row; 5293 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5294 if (flg) { 5295 pv = b->a + bs2*bdiag[row]; 5296 /* ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr); */ 5297 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5298 5299 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5300 pv = b->a + bs2*(bdiag[row+1]+1); 5301 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5302 for (j=0; j<nz; j++) { 5303 v = rtmp + bs2*pj[j]; 5304 /* ierr = Kernel_A_gets_A_minus_B_times_C_15(v,pc,pv);CHKERRQ(ierr); */ 5305 Kernel_A_gets_A_minus_B_times_C(bs,v,pc,pv); 5306 pv += bs2; 5307 } 5308 ierr = PetscLogFlops(2*bs2*bs*nz+2*bs2*bs-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5309 } 5310 } 5311 5312 /* finished row so stick it into b->a */ 5313 /* L part */ 5314 pv = b->a + bs2*bi[i] ; 5315 pj = b->j + bi[i] ; 5316 nz = bi[i+1] - bi[i]; 5317 for (j=0; j<nz; j++) { 5318 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5319 } 5320 5321 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5322 pv = b->a + bs2*bdiag[i]; 5323 pj = b->j + bdiag[i]; 5324 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5325 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5326 /*ierr = Kernel_A_gets_inverse_A_7(pv,shift);CHKERRQ(ierr); */ 5327 5328 /* U part */ 5329 pv = b->a + bs2*(bdiag[i+1]+1); 5330 pj = b->j + bdiag[i+1]+1; 5331 nz = bdiag[i] - bdiag[i+1] - 1; 5332 for (j=0; j<nz; j++){ 5333 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5334 } 5335 } 5336 5337 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5338 ierr = PetscFree2(v_work,v_pivots);CHKERRQ(ierr); 5339 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5340 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5341 C->ops->solve = MatSolve_SeqBAIJ_15; 5342 C->ops->solvetranspose = 0; 5343 C->assembled = PETSC_TRUE; 5344 ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5345 PetscFunctionReturn(0); 5346 } 5347 5348 #undef __FUNCT__ 5349 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5350 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5351 { 5352 Mat C=B; 5353 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5354 IS isrow = b->row,isicol = b->icol; 5355 PetscErrorCode ierr; 5356 const PetscInt *r,*ic,*ics; 5357 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5358 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5359 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5360 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5361 MatScalar *v_work; 5362 PetscTruth col_identity,row_identity,both_identity; 5363 5364 PetscFunctionBegin; 5365 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5366 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5367 5368 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5369 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5370 ics = ic; 5371 5372 /* generate work space needed by dense LU factorization */ 5373 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5374 5375 for (i=0; i<n; i++){ 5376 /* zero rtmp */ 5377 /* L part */ 5378 nz = bi[i+1] - bi[i]; 5379 bjtmp = bj + bi[i]; 5380 for (j=0; j<nz; j++){ 5381 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5382 } 5383 5384 /* U part */ 5385 nz = bdiag[i] - bdiag[i+1]; 5386 bjtmp = bj + bdiag[i+1]+1; 5387 for (j=0; j<nz; j++){ 5388 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5389 } 5390 5391 /* load in initial (unfactored row) */ 5392 nz = ai[r[i]+1] - ai[r[i]]; 5393 ajtmp = aj + ai[r[i]]; 5394 v = aa + bs2*ai[r[i]]; 5395 for (j=0; j<nz; j++) { 5396 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5397 } 5398 5399 /* elimination */ 5400 bjtmp = bj + bi[i]; 5401 nzL = bi[i+1] - bi[i]; 5402 for(k=0;k < nzL;k++) { 5403 row = bjtmp[k]; 5404 pc = rtmp + bs2*row; 5405 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5406 if (flg) { 5407 pv = b->a + bs2*bdiag[row]; 5408 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5409 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5410 pv = b->a + bs2*(bdiag[row+1]+1); 5411 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5412 for (j=0; j<nz; j++) { 5413 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5414 } 5415 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5416 } 5417 } 5418 5419 /* finished row so stick it into b->a */ 5420 /* L part */ 5421 pv = b->a + bs2*bi[i] ; 5422 pj = b->j + bi[i] ; 5423 nz = bi[i+1] - bi[i]; 5424 for (j=0; j<nz; j++) { 5425 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5426 } 5427 5428 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5429 pv = b->a + bs2*bdiag[i]; 5430 pj = b->j + bdiag[i]; 5431 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5432 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5433 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5434 5435 /* U part */ 5436 pv = b->a + bs2*(bdiag[i+1]+1); 5437 pj = b->j + bdiag[i+1]+1; 5438 nz = bdiag[i] - bdiag[i+1] - 1; 5439 for (j=0; j<nz; j++){ 5440 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5441 } 5442 } 5443 5444 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5445 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5446 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5447 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5448 5449 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5450 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5451 both_identity = (PetscTruth) (row_identity && col_identity); 5452 if (both_identity){ 5453 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5454 } else { 5455 C->ops->solve = MatSolve_SeqBAIJ_N; 5456 } 5457 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5458 5459 C->assembled = PETSC_TRUE; 5460 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5461 PetscFunctionReturn(0); 5462 } 5463 5464 /* 5465 ilu(0) with natural ordering under new data structure. 5466 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5467 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 5468 */ 5469 5470 #undef __FUNCT__ 5471 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5472 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5473 { 5474 5475 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5476 PetscErrorCode ierr; 5477 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5478 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5479 5480 PetscFunctionBegin; 5481 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5482 b = (Mat_SeqBAIJ*)(fact)->data; 5483 5484 /* allocate matrix arrays for new data structure */ 5485 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5486 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5487 b->singlemalloc = PETSC_TRUE; 5488 if (!b->diag){ 5489 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5490 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5491 } 5492 bdiag = b->diag; 5493 5494 if (n > 0) { 5495 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5496 } 5497 5498 /* set bi and bj with new data structure */ 5499 bi = b->i; 5500 bj = b->j; 5501 5502 /* L part */ 5503 bi[0] = 0; 5504 for (i=0; i<n; i++){ 5505 nz = adiag[i] - ai[i]; 5506 bi[i+1] = bi[i] + nz; 5507 aj = a->j + ai[i]; 5508 for (j=0; j<nz; j++){ 5509 *bj = aj[j]; bj++; 5510 } 5511 } 5512 5513 /* U part */ 5514 bi_temp = bi[n]; 5515 bdiag[n] = bi[n]-1; 5516 for (i=n-1; i>=0; i--){ 5517 nz = ai[i+1] - adiag[i] - 1; 5518 bi_temp = bi_temp + nz + 1; 5519 aj = a->j + adiag[i] + 1; 5520 for (j=0; j<nz; j++){ 5521 *bj = aj[j]; bj++; 5522 } 5523 /* diag[i] */ 5524 *bj = i; bj++; 5525 bdiag[i] = bi_temp - 1; 5526 } 5527 PetscFunctionReturn(0); 5528 } 5529 5530 #undef __FUNCT__ 5531 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5532 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5533 { 5534 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5535 IS isicol; 5536 PetscErrorCode ierr; 5537 const PetscInt *r,*ic; 5538 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5539 PetscInt *bi,*cols,nnz,*cols_lvl; 5540 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5541 PetscInt i,levels,diagonal_fill; 5542 PetscTruth col_identity,row_identity,both_identity; 5543 PetscReal f; 5544 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5545 PetscBT lnkbt; 5546 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5547 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5548 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5549 PetscTruth missing; 5550 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5551 PetscTruth olddatastruct = PETSC_FALSE; 5552 5553 PetscFunctionBegin; 5554 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr); 5555 if (olddatastruct){ 5556 ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5557 PetscFunctionReturn(0); 5558 } 5559 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5560 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5561 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5562 5563 f = info->fill; 5564 levels = (PetscInt)info->levels; 5565 diagonal_fill = (PetscInt)info->diagonal_fill; 5566 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5567 5568 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5569 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5570 both_identity = (PetscTruth) (row_identity && col_identity); 5571 5572 if (!levels && both_identity) { 5573 /* special case: ilu(0) with natural ordering */ 5574 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5575 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5576 5577 fact->factor = MAT_FACTOR_ILU; 5578 (fact)->info.factor_mallocs = 0; 5579 (fact)->info.fill_ratio_given = info->fill; 5580 (fact)->info.fill_ratio_needed = 1.0; 5581 b = (Mat_SeqBAIJ*)(fact)->data; 5582 b->row = isrow; 5583 b->col = iscol; 5584 b->icol = isicol; 5585 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5586 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5587 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5588 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5589 PetscFunctionReturn(0); 5590 } 5591 5592 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5593 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5594 5595 /* get new row pointers */ 5596 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5597 bi[0] = 0; 5598 /* bdiag is location of diagonal in factor */ 5599 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5600 bdiag[0] = 0; 5601 5602 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5603 5604 /* create a linked list for storing column indices of the active row */ 5605 nlnk = n + 1; 5606 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5607 5608 /* initial FreeSpace size is f*(ai[n]+1) */ 5609 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5610 current_space = free_space; 5611 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5612 current_space_lvl = free_space_lvl; 5613 5614 for (i=0; i<n; i++) { 5615 nzi = 0; 5616 /* copy current row into linked list */ 5617 nnz = ai[r[i]+1] - ai[r[i]]; 5618 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5619 cols = aj + ai[r[i]]; 5620 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5621 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5622 nzi += nlnk; 5623 5624 /* make sure diagonal entry is included */ 5625 if (diagonal_fill && lnk[i] == -1) { 5626 fm = n; 5627 while (lnk[fm] < i) fm = lnk[fm]; 5628 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5629 lnk[fm] = i; 5630 lnk_lvl[i] = 0; 5631 nzi++; dcount++; 5632 } 5633 5634 /* add pivot rows into the active row */ 5635 nzbd = 0; 5636 prow = lnk[n]; 5637 while (prow < i) { 5638 nnz = bdiag[prow]; 5639 cols = bj_ptr[prow] + nnz + 1; 5640 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5641 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5642 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5643 nzi += nlnk; 5644 prow = lnk[prow]; 5645 nzbd++; 5646 } 5647 bdiag[i] = nzbd; 5648 bi[i+1] = bi[i] + nzi; 5649 5650 /* if free space is not available, make more free space */ 5651 if (current_space->local_remaining<nzi) { 5652 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5653 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5654 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5655 reallocs++; 5656 } 5657 5658 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5659 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5660 bj_ptr[i] = current_space->array; 5661 bjlvl_ptr[i] = current_space_lvl->array; 5662 5663 /* make sure the active row i has diagonal entry */ 5664 if (*(bj_ptr[i]+bdiag[i]) != i) { 5665 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5666 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5667 } 5668 5669 current_space->array += nzi; 5670 current_space->local_used += nzi; 5671 current_space->local_remaining -= nzi; 5672 current_space_lvl->array += nzi; 5673 current_space_lvl->local_used += nzi; 5674 current_space_lvl->local_remaining -= nzi; 5675 } 5676 5677 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5678 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5679 5680 /* destroy list of free space and other temporary arrays */ 5681 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5682 5683 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5684 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5685 5686 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5687 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5688 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5689 5690 #if defined(PETSC_USE_INFO) 5691 { 5692 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5693 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5694 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5695 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5696 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5697 if (diagonal_fill) { 5698 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5699 } 5700 } 5701 #endif 5702 5703 /* put together the new matrix */ 5704 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5705 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5706 b = (Mat_SeqBAIJ*)(fact)->data; 5707 b->free_a = PETSC_TRUE; 5708 b->free_ij = PETSC_TRUE; 5709 b->singlemalloc = PETSC_FALSE; 5710 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5711 b->j = bj; 5712 b->i = bi; 5713 b->diag = bdiag; 5714 b->free_diag = PETSC_TRUE; 5715 b->ilen = 0; 5716 b->imax = 0; 5717 b->row = isrow; 5718 b->col = iscol; 5719 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5720 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5721 b->icol = isicol; 5722 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5723 /* In b structure: Free imax, ilen, old a, old j. 5724 Allocate bdiag, solve_work, new a, new j */ 5725 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5726 b->maxnz = b->nz = bdiag[0]+1; 5727 fact->info.factor_mallocs = reallocs; 5728 fact->info.fill_ratio_given = f; 5729 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5730 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5731 PetscFunctionReturn(0); 5732 } 5733 5734 5735 /* 5736 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5737 except that the data structure of Mat_SeqAIJ is slightly different. 5738 Not a good example of code reuse. 5739 */ 5740 #undef __FUNCT__ 5741 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 5742 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5743 { 5744 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5745 IS isicol; 5746 PetscErrorCode ierr; 5747 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5748 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5749 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5750 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5751 PetscTruth col_identity,row_identity,both_identity,flg; 5752 PetscReal f; 5753 5754 PetscFunctionBegin; 5755 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5756 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5757 5758 f = info->fill; 5759 levels = (PetscInt)info->levels; 5760 diagonal_fill = (PetscInt)info->diagonal_fill; 5761 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5762 5763 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5764 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5765 both_identity = (PetscTruth) (row_identity && col_identity); 5766 5767 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5768 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5769 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 5770 5771 fact->factor = MAT_FACTOR_ILU; 5772 b = (Mat_SeqBAIJ*)fact->data; 5773 b->row = isrow; 5774 b->col = iscol; 5775 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5776 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5777 b->icol = isicol; 5778 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5779 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5780 PetscFunctionReturn(0); 5781 } 5782 5783 /* general case perform the symbolic factorization */ 5784 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5785 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5786 5787 /* get new row pointers */ 5788 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5789 ainew[0] = 0; 5790 /* don't know how many column pointers are needed so estimate */ 5791 jmax = (PetscInt)(f*ai[n] + 1); 5792 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5793 /* ajfill is level of fill for each fill entry */ 5794 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5795 /* fill is a linked list of nonzeros in active row */ 5796 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5797 /* im is level for each filled value */ 5798 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5799 /* dloc is location of diagonal in factor */ 5800 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5801 dloc[0] = 0; 5802 for (prow=0; prow<n; prow++) { 5803 5804 /* copy prow into linked list */ 5805 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5806 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5807 xi = aj + ai[r[prow]]; 5808 fill[n] = n; 5809 fill[prow] = -1; /* marker for diagonal entry */ 5810 while (nz--) { 5811 fm = n; 5812 idx = ic[*xi++]; 5813 do { 5814 m = fm; 5815 fm = fill[m]; 5816 } while (fm < idx); 5817 fill[m] = idx; 5818 fill[idx] = fm; 5819 im[idx] = 0; 5820 } 5821 5822 /* make sure diagonal entry is included */ 5823 if (diagonal_fill && fill[prow] == -1) { 5824 fm = n; 5825 while (fill[fm] < prow) fm = fill[fm]; 5826 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5827 fill[fm] = prow; 5828 im[prow] = 0; 5829 nzf++; 5830 dcount++; 5831 } 5832 5833 nzi = 0; 5834 row = fill[n]; 5835 while (row < prow) { 5836 incrlev = im[row] + 1; 5837 nz = dloc[row]; 5838 xi = ajnew + ainew[row] + nz + 1; 5839 flev = ajfill + ainew[row] + nz + 1; 5840 nnz = ainew[row+1] - ainew[row] - nz - 1; 5841 fm = row; 5842 while (nnz-- > 0) { 5843 idx = *xi++; 5844 if (*flev + incrlev > levels) { 5845 flev++; 5846 continue; 5847 } 5848 do { 5849 m = fm; 5850 fm = fill[m]; 5851 } while (fm < idx); 5852 if (fm != idx) { 5853 im[idx] = *flev + incrlev; 5854 fill[m] = idx; 5855 fill[idx] = fm; 5856 fm = idx; 5857 nzf++; 5858 } else { 5859 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5860 } 5861 flev++; 5862 } 5863 row = fill[row]; 5864 nzi++; 5865 } 5866 /* copy new filled row into permanent storage */ 5867 ainew[prow+1] = ainew[prow] + nzf; 5868 if (ainew[prow+1] > jmax) { 5869 5870 /* estimate how much additional space we will need */ 5871 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5872 /* just double the memory each time */ 5873 PetscInt maxadd = jmax; 5874 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5875 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5876 jmax += maxadd; 5877 5878 /* allocate a longer ajnew and ajfill */ 5879 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5880 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5881 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5882 ajnew = xitmp; 5883 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5884 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5885 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5886 ajfill = xitmp; 5887 reallocate++; /* count how many reallocations are needed */ 5888 } 5889 xitmp = ajnew + ainew[prow]; 5890 flev = ajfill + ainew[prow]; 5891 dloc[prow] = nzi; 5892 fm = fill[n]; 5893 while (nzf--) { 5894 *xitmp++ = fm; 5895 *flev++ = im[fm]; 5896 fm = fill[fm]; 5897 } 5898 /* make sure row has diagonal entry */ 5899 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5900 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5901 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5902 } 5903 } 5904 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5905 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5906 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5907 ierr = PetscFree(fill);CHKERRQ(ierr); 5908 ierr = PetscFree(im);CHKERRQ(ierr); 5909 5910 #if defined(PETSC_USE_INFO) 5911 { 5912 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5913 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5914 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5915 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5916 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5917 if (diagonal_fill) { 5918 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5919 } 5920 } 5921 #endif 5922 5923 /* put together the new matrix */ 5924 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5925 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5926 b = (Mat_SeqBAIJ*)fact->data; 5927 b->free_a = PETSC_TRUE; 5928 b->free_ij = PETSC_TRUE; 5929 b->singlemalloc = PETSC_FALSE; 5930 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5931 b->j = ajnew; 5932 b->i = ainew; 5933 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5934 b->diag = dloc; 5935 b->free_diag = PETSC_TRUE; 5936 b->ilen = 0; 5937 b->imax = 0; 5938 b->row = isrow; 5939 b->col = iscol; 5940 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5941 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5942 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5943 b->icol = isicol; 5944 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5945 /* In b structure: Free imax, ilen, old a, old j. 5946 Allocate dloc, solve_work, new a, new j */ 5947 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5948 b->maxnz = b->nz = ainew[n]; 5949 5950 fact->info.factor_mallocs = reallocate; 5951 fact->info.fill_ratio_given = f; 5952 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5953 5954 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 5955 PetscFunctionReturn(0); 5956 } 5957 5958 #undef __FUNCT__ 5959 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5960 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5961 { 5962 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5963 /* int i,*AJ=a->j,nz=a->nz; */ 5964 PetscFunctionBegin; 5965 /* Undo Column scaling */ 5966 /* while (nz--) { */ 5967 /* AJ[i] = AJ[i]/4; */ 5968 /* } */ 5969 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5970 A->ops->setunfactored = PETSC_NULL; 5971 PetscFunctionReturn(0); 5972 } 5973 5974 #undef __FUNCT__ 5975 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5976 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5977 { 5978 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5979 PetscInt *AJ=a->j,nz=a->nz; 5980 unsigned short *aj=(unsigned short *)AJ; 5981 PetscFunctionBegin; 5982 /* Is this really necessary? */ 5983 while (nz--) { 5984 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5985 } 5986 A->ops->setunfactored = PETSC_NULL; 5987 PetscFunctionReturn(0); 5988 } 5989 5990 5991