1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "../src/mat/impls/baij/seq/baij.h" 8 #include "../src/mat/blockinvert.h" 9 #include "petscbt.h" 10 #include "../src/mat/utils/freespace.h" 11 12 #undef __FUNCT__ 13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 15 { 16 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17 PetscErrorCode ierr; 18 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 19 PetscInt *diag = a->diag; 20 MatScalar *aa=a->a,*v; 21 PetscScalar s1,*x,*b; 22 23 PetscFunctionBegin; 24 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 25 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 26 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 27 28 /* forward solve the U^T */ 29 for (i=0; i<n; i++) { 30 31 v = aa + diag[i]; 32 /* multiply by the inverse of the block diagonal */ 33 s1 = (*v++)*x[i]; 34 vi = aj + diag[i] + 1; 35 nz = ai[i+1] - diag[i] - 1; 36 while (nz--) { 37 x[*vi++] -= (*v++)*s1; 38 } 39 x[i] = s1; 40 } 41 /* backward solve the L^T */ 42 for (i=n-1; i>=0; i--){ 43 v = aa + diag[i] - 1; 44 vi = aj + diag[i] - 1; 45 nz = diag[i] - ai[i]; 46 s1 = x[i]; 47 while (nz--) { 48 x[*vi--] -= (*v--)*s1; 49 } 50 } 51 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 52 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 53 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 54 PetscFunctionReturn(0); 55 } 56 57 #undef __FUNCT__ 58 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 59 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 60 { 61 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 62 PetscErrorCode ierr; 63 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 64 PetscInt *diag = a->diag,oidx; 65 MatScalar *aa=a->a,*v; 66 PetscScalar s1,s2,x1,x2; 67 PetscScalar *x,*b; 68 69 PetscFunctionBegin; 70 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 71 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 72 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 73 74 /* forward solve the U^T */ 75 idx = 0; 76 for (i=0; i<n; i++) { 77 78 v = aa + 4*diag[i]; 79 /* multiply by the inverse of the block diagonal */ 80 x1 = x[idx]; x2 = x[1+idx]; 81 s1 = v[0]*x1 + v[1]*x2; 82 s2 = v[2]*x1 + v[3]*x2; 83 v += 4; 84 85 vi = aj + diag[i] + 1; 86 nz = ai[i+1] - diag[i] - 1; 87 while (nz--) { 88 oidx = 2*(*vi++); 89 x[oidx] -= v[0]*s1 + v[1]*s2; 90 x[oidx+1] -= v[2]*s1 + v[3]*s2; 91 v += 4; 92 } 93 x[idx] = s1;x[1+idx] = s2; 94 idx += 2; 95 } 96 /* backward solve the L^T */ 97 for (i=n-1; i>=0; i--){ 98 v = aa + 4*diag[i] - 4; 99 vi = aj + diag[i] - 1; 100 nz = diag[i] - ai[i]; 101 idt = 2*i; 102 s1 = x[idt]; s2 = x[1+idt]; 103 while (nz--) { 104 idx = 2*(*vi--); 105 x[idx] -= v[0]*s1 + v[1]*s2; 106 x[idx+1] -= v[2]*s1 + v[3]*s2; 107 v -= 4; 108 } 109 } 110 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 111 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 112 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 113 PetscFunctionReturn(0); 114 } 115 116 #undef __FUNCT__ 117 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 118 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 119 { 120 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 121 PetscErrorCode ierr; 122 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 123 PetscInt nz,idx,idt,j,i,oidx; 124 PetscInt bs=A->rmap->bs,bs2=a->bs2; 125 MatScalar *aa=a->a,*v; 126 PetscScalar s1,s2,x1,x2; 127 PetscScalar *x,*b; 128 129 PetscFunctionBegin; 130 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 131 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 132 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 133 134 /* forward solve the U^T */ 135 idx = 0; 136 for (i=0; i<n; i++) { 137 v = aa + bs2*diag[i]; 138 /* multiply by the inverse of the block diagonal */ 139 x1 = x[idx]; x2 = x[1+idx]; 140 s1 = v[0]*x1 + v[1]*x2; 141 s2 = v[2]*x1 + v[3]*x2; 142 v -= bs2; 143 144 vi = aj + diag[i] - 1; 145 nz = diag[i] - diag[i+1] - 1; 146 for(j=0;j>-nz;j--){ 147 oidx = bs*vi[j]; 148 x[oidx] -= v[0]*s1 + v[1]*s2; 149 x[oidx+1] -= v[2]*s1 + v[3]*s2; 150 v -= bs2; 151 } 152 x[idx] = s1;x[1+idx] = s2; 153 idx += bs; 154 } 155 /* backward solve the L^T */ 156 for (i=n-1; i>=0; i--){ 157 v = aa + bs2*ai[i]; 158 vi = aj + ai[i]; 159 nz = ai[i+1] - ai[i]; 160 idt = bs*i; 161 s1 = x[idt]; s2 = x[1+idt]; 162 for(j=0;j<nz;j++){ 163 idx = bs*vi[j]; 164 x[idx] -= v[0]*s1 + v[1]*s2; 165 x[idx+1] -= v[2]*s1 + v[3]*s2; 166 v += bs2; 167 } 168 } 169 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 170 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 171 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 172 PetscFunctionReturn(0); 173 } 174 175 #undef __FUNCT__ 176 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 177 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 178 { 179 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 180 PetscErrorCode ierr; 181 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 182 PetscInt *diag = a->diag,oidx; 183 MatScalar *aa=a->a,*v; 184 PetscScalar s1,s2,s3,x1,x2,x3; 185 PetscScalar *x,*b; 186 187 PetscFunctionBegin; 188 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 189 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 190 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 191 192 /* forward solve the U^T */ 193 idx = 0; 194 for (i=0; i<n; i++) { 195 196 v = aa + 9*diag[i]; 197 /* multiply by the inverse of the block diagonal */ 198 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 199 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 200 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 201 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 202 v += 9; 203 204 vi = aj + diag[i] + 1; 205 nz = ai[i+1] - diag[i] - 1; 206 while (nz--) { 207 oidx = 3*(*vi++); 208 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 209 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 210 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 211 v += 9; 212 } 213 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 214 idx += 3; 215 } 216 /* backward solve the L^T */ 217 for (i=n-1; i>=0; i--){ 218 v = aa + 9*diag[i] - 9; 219 vi = aj + diag[i] - 1; 220 nz = diag[i] - ai[i]; 221 idt = 3*i; 222 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 223 while (nz--) { 224 idx = 3*(*vi--); 225 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 226 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 227 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 228 v -= 9; 229 } 230 } 231 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 232 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 233 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 234 PetscFunctionReturn(0); 235 } 236 237 #undef __FUNCT__ 238 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 239 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 240 { 241 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 242 PetscErrorCode ierr; 243 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 244 PetscInt nz,idx,idt,j,i,oidx; 245 PetscInt bs=A->rmap->bs,bs2=a->bs2; 246 MatScalar *aa=a->a,*v; 247 PetscScalar s1,s2,s3,x1,x2,x3; 248 PetscScalar *x,*b; 249 250 PetscFunctionBegin; 251 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 252 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 253 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 254 255 /* forward solve the U^T */ 256 idx = 0; 257 for (i=0; i<n; i++) { 258 v = aa + bs2*diag[i]; 259 /* multiply by the inverse of the block diagonal */ 260 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 261 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 262 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 263 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 264 v -= bs2; 265 266 vi = aj + diag[i] - 1; 267 nz = diag[i] - diag[i+1] - 1; 268 for(j=0;j>-nz;j--){ 269 oidx = bs*vi[j]; 270 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 271 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 272 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 273 v -= bs2; 274 } 275 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 276 idx += bs; 277 } 278 /* backward solve the L^T */ 279 for (i=n-1; i>=0; i--){ 280 v = aa + bs2*ai[i]; 281 vi = aj + ai[i]; 282 nz = ai[i+1] - ai[i]; 283 idt = bs*i; 284 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 285 for(j=0;j<nz;j++){ 286 idx = bs*vi[j]; 287 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 288 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 289 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 290 v += bs2; 291 } 292 } 293 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 294 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 295 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 296 PetscFunctionReturn(0); 297 } 298 299 #undef __FUNCT__ 300 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 301 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 302 { 303 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 304 PetscErrorCode ierr; 305 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 306 PetscInt *diag = a->diag,oidx; 307 MatScalar *aa=a->a,*v; 308 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 309 PetscScalar *x,*b; 310 311 PetscFunctionBegin; 312 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 313 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 314 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 315 316 /* forward solve the U^T */ 317 idx = 0; 318 for (i=0; i<n; i++) { 319 320 v = aa + 16*diag[i]; 321 /* multiply by the inverse of the block diagonal */ 322 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 323 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 324 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 325 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 326 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 327 v += 16; 328 329 vi = aj + diag[i] + 1; 330 nz = ai[i+1] - diag[i] - 1; 331 while (nz--) { 332 oidx = 4*(*vi++); 333 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 334 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 335 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 336 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 337 v += 16; 338 } 339 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 340 idx += 4; 341 } 342 /* backward solve the L^T */ 343 for (i=n-1; i>=0; i--){ 344 v = aa + 16*diag[i] - 16; 345 vi = aj + diag[i] - 1; 346 nz = diag[i] - ai[i]; 347 idt = 4*i; 348 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 349 while (nz--) { 350 idx = 4*(*vi--); 351 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 352 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 353 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 354 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 355 v -= 16; 356 } 357 } 358 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 359 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 360 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 361 PetscFunctionReturn(0); 362 } 363 364 #undef __FUNCT__ 365 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 366 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 367 { 368 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 369 PetscErrorCode ierr; 370 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 371 PetscInt nz,idx,idt,j,i,oidx; 372 PetscInt bs=A->rmap->bs,bs2=a->bs2; 373 MatScalar *aa=a->a,*v; 374 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 375 PetscScalar *x,*b; 376 377 PetscFunctionBegin; 378 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 379 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 380 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 381 382 /* forward solve the U^T */ 383 idx = 0; 384 for (i=0; i<n; i++) { 385 v = aa + bs2*diag[i]; 386 /* multiply by the inverse of the block diagonal */ 387 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 388 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 389 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 390 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 391 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 392 v -= bs2; 393 394 vi = aj + diag[i] - 1; 395 nz = diag[i] - diag[i+1] - 1; 396 for(j=0;j>-nz;j--){ 397 oidx = bs*vi[j]; 398 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 399 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 400 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 401 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 402 v -= bs2; 403 } 404 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 405 idx += bs; 406 } 407 /* backward solve the L^T */ 408 for (i=n-1; i>=0; i--){ 409 v = aa + bs2*ai[i]; 410 vi = aj + ai[i]; 411 nz = ai[i+1] - ai[i]; 412 idt = bs*i; 413 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 414 for(j=0;j<nz;j++){ 415 idx = bs*vi[j]; 416 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 417 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 418 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 419 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 420 v += bs2; 421 } 422 } 423 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 424 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 425 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 426 PetscFunctionReturn(0); 427 } 428 429 #undef __FUNCT__ 430 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 431 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 432 { 433 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 434 PetscErrorCode ierr; 435 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 436 PetscInt *diag = a->diag,oidx; 437 MatScalar *aa=a->a,*v; 438 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 439 PetscScalar *x,*b; 440 441 PetscFunctionBegin; 442 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 443 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 444 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 445 446 /* forward solve the U^T */ 447 idx = 0; 448 for (i=0; i<n; i++) { 449 450 v = aa + 25*diag[i]; 451 /* multiply by the inverse of the block diagonal */ 452 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 453 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 454 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 455 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 456 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 457 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 458 v += 25; 459 460 vi = aj + diag[i] + 1; 461 nz = ai[i+1] - diag[i] - 1; 462 while (nz--) { 463 oidx = 5*(*vi++); 464 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 465 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 466 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 467 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 468 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 469 v += 25; 470 } 471 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 472 idx += 5; 473 } 474 /* backward solve the L^T */ 475 for (i=n-1; i>=0; i--){ 476 v = aa + 25*diag[i] - 25; 477 vi = aj + diag[i] - 1; 478 nz = diag[i] - ai[i]; 479 idt = 5*i; 480 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 481 while (nz--) { 482 idx = 5*(*vi--); 483 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 484 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 485 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 486 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 487 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 488 v -= 25; 489 } 490 } 491 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 492 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 493 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 494 PetscFunctionReturn(0); 495 } 496 497 #undef __FUNCT__ 498 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 499 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 500 { 501 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 502 PetscErrorCode ierr; 503 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 504 PetscInt nz,idx,idt,j,i,oidx; 505 PetscInt bs=A->rmap->bs,bs2=a->bs2; 506 MatScalar *aa=a->a,*v; 507 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 508 PetscScalar *x,*b; 509 510 PetscFunctionBegin; 511 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 512 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 513 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 514 515 /* forward solve the U^T */ 516 idx = 0; 517 for (i=0; i<n; i++) { 518 v = aa + bs2*diag[i]; 519 /* multiply by the inverse of the block diagonal */ 520 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 521 x5 = x[4+idx]; 522 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 523 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 524 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 525 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 526 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 527 v -= bs2; 528 529 vi = aj + diag[i] - 1; 530 nz = diag[i] - diag[i+1] - 1; 531 for(j=0;j>-nz;j--){ 532 oidx = bs*vi[j]; 533 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 534 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 535 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 536 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 537 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 538 v -= bs2; 539 } 540 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 541 idx += bs; 542 } 543 /* backward solve the L^T */ 544 for (i=n-1; i>=0; i--){ 545 v = aa + bs2*ai[i]; 546 vi = aj + ai[i]; 547 nz = ai[i+1] - ai[i]; 548 idt = bs*i; 549 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 550 for(j=0;j<nz;j++){ 551 idx = bs*vi[j]; 552 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 553 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 554 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 555 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 556 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 557 v += bs2; 558 } 559 } 560 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 561 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 562 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 563 PetscFunctionReturn(0); 564 } 565 566 #undef __FUNCT__ 567 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 568 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 569 { 570 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 571 PetscErrorCode ierr; 572 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 573 PetscInt *diag = a->diag,oidx; 574 MatScalar *aa=a->a,*v; 575 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 576 PetscScalar *x,*b; 577 578 PetscFunctionBegin; 579 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 580 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 581 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 582 583 /* forward solve the U^T */ 584 idx = 0; 585 for (i=0; i<n; i++) { 586 587 v = aa + 36*diag[i]; 588 /* multiply by the inverse of the block diagonal */ 589 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 590 x6 = x[5+idx]; 591 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 592 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 593 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 594 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 595 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 596 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 597 v += 36; 598 599 vi = aj + diag[i] + 1; 600 nz = ai[i+1] - diag[i] - 1; 601 while (nz--) { 602 oidx = 6*(*vi++); 603 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 604 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 605 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 606 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 607 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 608 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 609 v += 36; 610 } 611 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 612 x[5+idx] = s6; 613 idx += 6; 614 } 615 /* backward solve the L^T */ 616 for (i=n-1; i>=0; i--){ 617 v = aa + 36*diag[i] - 36; 618 vi = aj + diag[i] - 1; 619 nz = diag[i] - ai[i]; 620 idt = 6*i; 621 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 622 s6 = x[5+idt]; 623 while (nz--) { 624 idx = 6*(*vi--); 625 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 626 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 627 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 628 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 629 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 630 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 631 v -= 36; 632 } 633 } 634 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 635 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 636 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 637 PetscFunctionReturn(0); 638 } 639 640 #undef __FUNCT__ 641 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 642 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 643 { 644 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 645 PetscErrorCode ierr; 646 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 647 PetscInt nz,idx,idt,j,i,oidx; 648 PetscInt bs=A->rmap->bs,bs2=a->bs2; 649 MatScalar *aa=a->a,*v; 650 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 651 PetscScalar *x,*b; 652 653 PetscFunctionBegin; 654 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 655 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 656 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 657 658 /* forward solve the U^T */ 659 idx = 0; 660 for (i=0; i<n; i++) { 661 v = aa + bs2*diag[i]; 662 /* multiply by the inverse of the block diagonal */ 663 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 664 x5 = x[4+idx]; x6 = x[5+idx]; 665 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 666 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 667 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 668 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 669 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 670 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 671 v -= bs2; 672 673 vi = aj + diag[i] - 1; 674 nz = diag[i] - diag[i+1] - 1; 675 for(j=0;j>-nz;j--){ 676 oidx = bs*vi[j]; 677 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 678 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 679 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 680 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 681 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 682 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 683 v -= bs2; 684 } 685 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 686 x[5+idx] = s6; 687 idx += bs; 688 } 689 /* backward solve the L^T */ 690 for (i=n-1; i>=0; i--){ 691 v = aa + bs2*ai[i]; 692 vi = aj + ai[i]; 693 nz = ai[i+1] - ai[i]; 694 idt = bs*i; 695 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 696 s6 = x[5+idt]; 697 for(j=0;j<nz;j++){ 698 idx = bs*vi[j]; 699 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 700 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 701 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 702 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 703 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 704 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 705 v += bs2; 706 } 707 } 708 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 710 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 711 PetscFunctionReturn(0); 712 } 713 714 #undef __FUNCT__ 715 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 716 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 717 { 718 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 719 PetscErrorCode ierr; 720 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 721 PetscInt *diag = a->diag,oidx; 722 MatScalar *aa=a->a,*v; 723 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 724 PetscScalar *x,*b; 725 726 PetscFunctionBegin; 727 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 728 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 729 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 730 731 /* forward solve the U^T */ 732 idx = 0; 733 for (i=0; i<n; i++) { 734 735 v = aa + 49*diag[i]; 736 /* multiply by the inverse of the block diagonal */ 737 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 738 x6 = x[5+idx]; x7 = x[6+idx]; 739 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 740 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 741 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 742 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 743 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 744 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 745 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 746 v += 49; 747 748 vi = aj + diag[i] + 1; 749 nz = ai[i+1] - diag[i] - 1; 750 while (nz--) { 751 oidx = 7*(*vi++); 752 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 753 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 754 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 755 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 756 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 757 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 758 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 759 v += 49; 760 } 761 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 762 x[5+idx] = s6;x[6+idx] = s7; 763 idx += 7; 764 } 765 /* backward solve the L^T */ 766 for (i=n-1; i>=0; i--){ 767 v = aa + 49*diag[i] - 49; 768 vi = aj + diag[i] - 1; 769 nz = diag[i] - ai[i]; 770 idt = 7*i; 771 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 772 s6 = x[5+idt];s7 = x[6+idt]; 773 while (nz--) { 774 idx = 7*(*vi--); 775 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 776 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 777 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 778 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 779 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 780 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 781 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 782 v -= 49; 783 } 784 } 785 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 786 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 787 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 788 PetscFunctionReturn(0); 789 } 790 #undef __FUNCT__ 791 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 792 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 793 { 794 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 795 PetscErrorCode ierr; 796 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 797 PetscInt nz,idx,idt,j,i,oidx; 798 PetscInt bs=A->rmap->bs,bs2=a->bs2; 799 MatScalar *aa=a->a,*v; 800 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 801 PetscScalar *x,*b; 802 803 PetscFunctionBegin; 804 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 805 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 807 808 /* forward solve the U^T */ 809 idx = 0; 810 for (i=0; i<n; i++) { 811 v = aa + bs2*diag[i]; 812 /* multiply by the inverse of the block diagonal */ 813 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 814 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 815 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 816 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 817 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 818 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 819 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 820 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 821 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 822 v -= bs2; 823 vi = aj + diag[i] - 1; 824 nz = diag[i] - diag[i+1] - 1; 825 for(j=0;j>-nz;j--){ 826 oidx = bs*vi[j]; 827 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 828 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 829 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 830 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 831 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 832 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 833 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 834 v -= bs2; 835 } 836 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 837 x[5+idx] = s6; x[6+idx] = s7; 838 idx += bs; 839 } 840 /* backward solve the L^T */ 841 for (i=n-1; i>=0; i--){ 842 v = aa + bs2*ai[i]; 843 vi = aj + ai[i]; 844 nz = ai[i+1] - ai[i]; 845 idt = bs*i; 846 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 847 s6 = x[5+idt]; s7 = x[6+idt]; 848 for(j=0;j<nz;j++){ 849 idx = bs*vi[j]; 850 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 851 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 852 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 853 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 854 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 855 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 856 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 857 v += bs2; 858 } 859 } 860 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 861 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 862 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 863 PetscFunctionReturn(0); 864 } 865 866 /*---------------------------------------------------------------------------------------------*/ 867 #undef __FUNCT__ 868 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 869 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 870 { 871 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 872 IS iscol=a->col,isrow=a->row; 873 PetscErrorCode ierr; 874 const PetscInt *r,*c,*rout,*cout; 875 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 876 PetscInt *diag = a->diag; 877 MatScalar *aa=a->a,*v; 878 PetscScalar s1,*x,*b,*t; 879 880 PetscFunctionBegin; 881 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 882 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 883 t = a->solve_work; 884 885 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 886 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 887 888 /* copy the b into temp work space according to permutation */ 889 for (i=0; i<n; i++) { 890 t[i] = b[c[i]]; 891 } 892 893 /* forward solve the U^T */ 894 for (i=0; i<n; i++) { 895 896 v = aa + diag[i]; 897 /* multiply by the inverse of the block diagonal */ 898 s1 = (*v++)*t[i]; 899 vi = aj + diag[i] + 1; 900 nz = ai[i+1] - diag[i] - 1; 901 while (nz--) { 902 t[*vi++] -= (*v++)*s1; 903 } 904 t[i] = s1; 905 } 906 /* backward solve the L^T */ 907 for (i=n-1; i>=0; i--){ 908 v = aa + diag[i] - 1; 909 vi = aj + diag[i] - 1; 910 nz = diag[i] - ai[i]; 911 s1 = t[i]; 912 while (nz--) { 913 t[*vi--] -= (*v--)*s1; 914 } 915 } 916 917 /* copy t into x according to permutation */ 918 for (i=0; i<n; i++) { 919 x[r[i]] = t[i]; 920 } 921 922 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 923 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 924 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 925 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 926 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 927 PetscFunctionReturn(0); 928 } 929 930 #undef __FUNCT__ 931 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 932 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 933 { 934 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 935 IS iscol=a->col,isrow=a->row; 936 PetscErrorCode ierr; 937 const PetscInt *r,*c,*rout,*cout; 938 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 939 PetscInt *diag = a->diag,ii,ic,ir,oidx; 940 MatScalar *aa=a->a,*v; 941 PetscScalar s1,s2,x1,x2; 942 PetscScalar *x,*b,*t; 943 944 PetscFunctionBegin; 945 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 946 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 947 t = a->solve_work; 948 949 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 950 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 951 952 /* copy the b into temp work space according to permutation */ 953 ii = 0; 954 for (i=0; i<n; i++) { 955 ic = 2*c[i]; 956 t[ii] = b[ic]; 957 t[ii+1] = b[ic+1]; 958 ii += 2; 959 } 960 961 /* forward solve the U^T */ 962 idx = 0; 963 for (i=0; i<n; i++) { 964 965 v = aa + 4*diag[i]; 966 /* multiply by the inverse of the block diagonal */ 967 x1 = t[idx]; x2 = t[1+idx]; 968 s1 = v[0]*x1 + v[1]*x2; 969 s2 = v[2]*x1 + v[3]*x2; 970 v += 4; 971 972 vi = aj + diag[i] + 1; 973 nz = ai[i+1] - diag[i] - 1; 974 while (nz--) { 975 oidx = 2*(*vi++); 976 t[oidx] -= v[0]*s1 + v[1]*s2; 977 t[oidx+1] -= v[2]*s1 + v[3]*s2; 978 v += 4; 979 } 980 t[idx] = s1;t[1+idx] = s2; 981 idx += 2; 982 } 983 /* backward solve the L^T */ 984 for (i=n-1; i>=0; i--){ 985 v = aa + 4*diag[i] - 4; 986 vi = aj + diag[i] - 1; 987 nz = diag[i] - ai[i]; 988 idt = 2*i; 989 s1 = t[idt]; s2 = t[1+idt]; 990 while (nz--) { 991 idx = 2*(*vi--); 992 t[idx] -= v[0]*s1 + v[1]*s2; 993 t[idx+1] -= v[2]*s1 + v[3]*s2; 994 v -= 4; 995 } 996 } 997 998 /* copy t into x according to permutation */ 999 ii = 0; 1000 for (i=0; i<n; i++) { 1001 ir = 2*r[i]; 1002 x[ir] = t[ii]; 1003 x[ir+1] = t[ii+1]; 1004 ii += 2; 1005 } 1006 1007 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1008 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1009 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1010 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1011 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1012 PetscFunctionReturn(0); 1013 } 1014 1015 #undef __FUNCT__ 1016 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1017 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1018 { 1019 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1020 PetscErrorCode ierr; 1021 IS iscol=a->col,isrow=a->row; 1022 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1023 const PetscInt *r,*c,*rout,*cout; 1024 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1025 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1026 MatScalar *aa=a->a,*v; 1027 PetscScalar s1,s2,x1,x2; 1028 PetscScalar *x,*b,*t; 1029 1030 PetscFunctionBegin; 1031 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1032 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1033 t = a->solve_work; 1034 1035 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1036 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1037 1038 /* copy b into temp work space according to permutation */ 1039 for(i=0;i<n;i++){ 1040 ii = bs*i; ic = bs*c[i]; 1041 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1042 } 1043 1044 /* forward solve the U^T */ 1045 idx = 0; 1046 for (i=0; i<n; i++) { 1047 v = aa + bs2*diag[i]; 1048 /* multiply by the inverse of the block diagonal */ 1049 x1 = t[idx]; x2 = t[1+idx]; 1050 s1 = v[0]*x1 + v[1]*x2; 1051 s2 = v[2]*x1 + v[3]*x2; 1052 v -= bs2; 1053 1054 vi = aj + diag[i] - 1; 1055 nz = diag[i] - diag[i+1] - 1; 1056 for(j=0;j>-nz;j--){ 1057 oidx = bs*vi[j]; 1058 t[oidx] -= v[0]*s1 + v[1]*s2; 1059 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1060 v -= bs2; 1061 } 1062 t[idx] = s1;t[1+idx] = s2; 1063 idx += bs; 1064 } 1065 /* backward solve the L^T */ 1066 for (i=n-1; i>=0; i--){ 1067 v = aa + bs2*ai[i]; 1068 vi = aj + ai[i]; 1069 nz = ai[i+1] - ai[i]; 1070 idt = bs*i; 1071 s1 = t[idt]; s2 = t[1+idt]; 1072 for(j=0;j<nz;j++){ 1073 idx = bs*vi[j]; 1074 t[idx] -= v[0]*s1 + v[1]*s2; 1075 t[idx+1] -= v[2]*s1 + v[3]*s2; 1076 v += bs2; 1077 } 1078 } 1079 1080 /* copy t into x according to permutation */ 1081 for(i=0;i<n;i++){ 1082 ii = bs*i; ir = bs*r[i]; 1083 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1084 } 1085 1086 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1087 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1088 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1090 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1091 PetscFunctionReturn(0); 1092 } 1093 1094 #undef __FUNCT__ 1095 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1096 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1097 { 1098 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1099 IS iscol=a->col,isrow=a->row; 1100 PetscErrorCode ierr; 1101 const PetscInt *r,*c,*rout,*cout; 1102 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1103 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1104 MatScalar *aa=a->a,*v; 1105 PetscScalar s1,s2,s3,x1,x2,x3; 1106 PetscScalar *x,*b,*t; 1107 1108 PetscFunctionBegin; 1109 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1110 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1111 t = a->solve_work; 1112 1113 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1114 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1115 1116 /* copy the b into temp work space according to permutation */ 1117 ii = 0; 1118 for (i=0; i<n; i++) { 1119 ic = 3*c[i]; 1120 t[ii] = b[ic]; 1121 t[ii+1] = b[ic+1]; 1122 t[ii+2] = b[ic+2]; 1123 ii += 3; 1124 } 1125 1126 /* forward solve the U^T */ 1127 idx = 0; 1128 for (i=0; i<n; i++) { 1129 1130 v = aa + 9*diag[i]; 1131 /* multiply by the inverse of the block diagonal */ 1132 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1133 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1134 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1135 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1136 v += 9; 1137 1138 vi = aj + diag[i] + 1; 1139 nz = ai[i+1] - diag[i] - 1; 1140 while (nz--) { 1141 oidx = 3*(*vi++); 1142 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1143 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1144 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1145 v += 9; 1146 } 1147 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1148 idx += 3; 1149 } 1150 /* backward solve the L^T */ 1151 for (i=n-1; i>=0; i--){ 1152 v = aa + 9*diag[i] - 9; 1153 vi = aj + diag[i] - 1; 1154 nz = diag[i] - ai[i]; 1155 idt = 3*i; 1156 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1157 while (nz--) { 1158 idx = 3*(*vi--); 1159 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1160 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1161 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1162 v -= 9; 1163 } 1164 } 1165 1166 /* copy t into x according to permutation */ 1167 ii = 0; 1168 for (i=0; i<n; i++) { 1169 ir = 3*r[i]; 1170 x[ir] = t[ii]; 1171 x[ir+1] = t[ii+1]; 1172 x[ir+2] = t[ii+2]; 1173 ii += 3; 1174 } 1175 1176 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1177 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1178 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1179 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1180 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1181 PetscFunctionReturn(0); 1182 } 1183 1184 #undef __FUNCT__ 1185 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1186 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1187 { 1188 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1189 PetscErrorCode ierr; 1190 IS iscol=a->col,isrow=a->row; 1191 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1192 const PetscInt *r,*c,*rout,*cout; 1193 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1194 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1195 MatScalar *aa=a->a,*v; 1196 PetscScalar s1,s2,s3,x1,x2,x3; 1197 PetscScalar *x,*b,*t; 1198 1199 PetscFunctionBegin; 1200 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1201 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1202 t = a->solve_work; 1203 1204 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1205 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1206 1207 /* copy b into temp work space according to permutation */ 1208 for(i=0;i<n;i++){ 1209 ii = bs*i; ic = bs*c[i]; 1210 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1211 } 1212 1213 /* forward solve the U^T */ 1214 idx = 0; 1215 for (i=0; i<n; i++) { 1216 v = aa + bs2*diag[i]; 1217 /* multiply by the inverse of the block diagonal */ 1218 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1219 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1220 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1221 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1222 v -= bs2; 1223 1224 vi = aj + diag[i] - 1; 1225 nz = diag[i] - diag[i+1] - 1; 1226 for(j=0;j>-nz;j--){ 1227 oidx = bs*vi[j]; 1228 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1229 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1230 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1231 v -= bs2; 1232 } 1233 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1234 idx += bs; 1235 } 1236 /* backward solve the L^T */ 1237 for (i=n-1; i>=0; i--){ 1238 v = aa + bs2*ai[i]; 1239 vi = aj + ai[i]; 1240 nz = ai[i+1] - ai[i]; 1241 idt = bs*i; 1242 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1243 for(j=0;j<nz;j++){ 1244 idx = bs*vi[j]; 1245 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1246 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1247 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1248 v += bs2; 1249 } 1250 } 1251 1252 /* copy t into x according to permutation */ 1253 for(i=0;i<n;i++){ 1254 ii = bs*i; ir = bs*r[i]; 1255 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1256 } 1257 1258 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1259 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1260 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1261 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1262 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1263 PetscFunctionReturn(0); 1264 } 1265 1266 #undef __FUNCT__ 1267 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1268 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1269 { 1270 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1271 IS iscol=a->col,isrow=a->row; 1272 PetscErrorCode ierr; 1273 const PetscInt *r,*c,*rout,*cout; 1274 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1275 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1276 MatScalar *aa=a->a,*v; 1277 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1278 PetscScalar *x,*b,*t; 1279 1280 PetscFunctionBegin; 1281 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1282 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1283 t = a->solve_work; 1284 1285 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1286 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1287 1288 /* copy the b into temp work space according to permutation */ 1289 ii = 0; 1290 for (i=0; i<n; i++) { 1291 ic = 4*c[i]; 1292 t[ii] = b[ic]; 1293 t[ii+1] = b[ic+1]; 1294 t[ii+2] = b[ic+2]; 1295 t[ii+3] = b[ic+3]; 1296 ii += 4; 1297 } 1298 1299 /* forward solve the U^T */ 1300 idx = 0; 1301 for (i=0; i<n; i++) { 1302 1303 v = aa + 16*diag[i]; 1304 /* multiply by the inverse of the block diagonal */ 1305 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1306 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1307 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1308 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1309 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1310 v += 16; 1311 1312 vi = aj + diag[i] + 1; 1313 nz = ai[i+1] - diag[i] - 1; 1314 while (nz--) { 1315 oidx = 4*(*vi++); 1316 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1317 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1318 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1319 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1320 v += 16; 1321 } 1322 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1323 idx += 4; 1324 } 1325 /* backward solve the L^T */ 1326 for (i=n-1; i>=0; i--){ 1327 v = aa + 16*diag[i] - 16; 1328 vi = aj + diag[i] - 1; 1329 nz = diag[i] - ai[i]; 1330 idt = 4*i; 1331 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1332 while (nz--) { 1333 idx = 4*(*vi--); 1334 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1335 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1336 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1337 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1338 v -= 16; 1339 } 1340 } 1341 1342 /* copy t into x according to permutation */ 1343 ii = 0; 1344 for (i=0; i<n; i++) { 1345 ir = 4*r[i]; 1346 x[ir] = t[ii]; 1347 x[ir+1] = t[ii+1]; 1348 x[ir+2] = t[ii+2]; 1349 x[ir+3] = t[ii+3]; 1350 ii += 4; 1351 } 1352 1353 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1354 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1355 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1356 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1357 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1358 PetscFunctionReturn(0); 1359 } 1360 1361 #undef __FUNCT__ 1362 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1363 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1364 { 1365 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1366 PetscErrorCode ierr; 1367 IS iscol=a->col,isrow=a->row; 1368 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1369 const PetscInt *r,*c,*rout,*cout; 1370 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1371 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1372 MatScalar *aa=a->a,*v; 1373 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1374 PetscScalar *x,*b,*t; 1375 1376 PetscFunctionBegin; 1377 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1378 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1379 t = a->solve_work; 1380 1381 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1382 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1383 1384 /* copy b into temp work space according to permutation */ 1385 for(i=0;i<n;i++){ 1386 ii = bs*i; ic = bs*c[i]; 1387 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1388 } 1389 1390 /* forward solve the U^T */ 1391 idx = 0; 1392 for (i=0; i<n; i++) { 1393 v = aa + bs2*diag[i]; 1394 /* multiply by the inverse of the block diagonal */ 1395 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1396 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1397 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1398 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1399 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1400 v -= bs2; 1401 1402 vi = aj + diag[i] - 1; 1403 nz = diag[i] - diag[i+1] - 1; 1404 for(j=0;j>-nz;j--){ 1405 oidx = bs*vi[j]; 1406 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1407 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1408 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1409 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1410 v -= bs2; 1411 } 1412 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1413 idx += bs; 1414 } 1415 /* backward solve the L^T */ 1416 for (i=n-1; i>=0; i--){ 1417 v = aa + bs2*ai[i]; 1418 vi = aj + ai[i]; 1419 nz = ai[i+1] - ai[i]; 1420 idt = bs*i; 1421 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1422 for(j=0;j<nz;j++){ 1423 idx = bs*vi[j]; 1424 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1425 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1426 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1427 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1428 v += bs2; 1429 } 1430 } 1431 1432 /* copy t into x according to permutation */ 1433 for(i=0;i<n;i++){ 1434 ii = bs*i; ir = bs*r[i]; 1435 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1436 } 1437 1438 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1439 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1440 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1441 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1442 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1443 PetscFunctionReturn(0); 1444 } 1445 1446 #undef __FUNCT__ 1447 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1448 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1449 { 1450 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1451 IS iscol=a->col,isrow=a->row; 1452 PetscErrorCode ierr; 1453 const PetscInt *r,*c,*rout,*cout; 1454 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1455 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1456 MatScalar *aa=a->a,*v; 1457 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1458 PetscScalar *x,*b,*t; 1459 1460 PetscFunctionBegin; 1461 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1462 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1463 t = a->solve_work; 1464 1465 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1466 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1467 1468 /* copy the b into temp work space according to permutation */ 1469 ii = 0; 1470 for (i=0; i<n; i++) { 1471 ic = 5*c[i]; 1472 t[ii] = b[ic]; 1473 t[ii+1] = b[ic+1]; 1474 t[ii+2] = b[ic+2]; 1475 t[ii+3] = b[ic+3]; 1476 t[ii+4] = b[ic+4]; 1477 ii += 5; 1478 } 1479 1480 /* forward solve the U^T */ 1481 idx = 0; 1482 for (i=0; i<n; i++) { 1483 1484 v = aa + 25*diag[i]; 1485 /* multiply by the inverse of the block diagonal */ 1486 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1487 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1488 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1489 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1490 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1491 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1492 v += 25; 1493 1494 vi = aj + diag[i] + 1; 1495 nz = ai[i+1] - diag[i] - 1; 1496 while (nz--) { 1497 oidx = 5*(*vi++); 1498 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1499 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1500 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1501 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1502 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1503 v += 25; 1504 } 1505 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1506 idx += 5; 1507 } 1508 /* backward solve the L^T */ 1509 for (i=n-1; i>=0; i--){ 1510 v = aa + 25*diag[i] - 25; 1511 vi = aj + diag[i] - 1; 1512 nz = diag[i] - ai[i]; 1513 idt = 5*i; 1514 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1515 while (nz--) { 1516 idx = 5*(*vi--); 1517 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1518 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1519 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1520 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1521 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1522 v -= 25; 1523 } 1524 } 1525 1526 /* copy t into x according to permutation */ 1527 ii = 0; 1528 for (i=0; i<n; i++) { 1529 ir = 5*r[i]; 1530 x[ir] = t[ii]; 1531 x[ir+1] = t[ii+1]; 1532 x[ir+2] = t[ii+2]; 1533 x[ir+3] = t[ii+3]; 1534 x[ir+4] = t[ii+4]; 1535 ii += 5; 1536 } 1537 1538 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1539 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1540 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1541 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1542 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1543 PetscFunctionReturn(0); 1544 } 1545 1546 #undef __FUNCT__ 1547 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1548 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1549 { 1550 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1551 PetscErrorCode ierr; 1552 IS iscol=a->col,isrow=a->row; 1553 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1554 const PetscInt *r,*c,*rout,*cout; 1555 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1556 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1557 MatScalar *aa=a->a,*v; 1558 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1559 PetscScalar *x,*b,*t; 1560 1561 PetscFunctionBegin; 1562 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1563 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1564 t = a->solve_work; 1565 1566 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1567 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1568 1569 /* copy b into temp work space according to permutation */ 1570 for(i=0;i<n;i++){ 1571 ii = bs*i; ic = bs*c[i]; 1572 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1573 t[ii+4] = b[ic+4]; 1574 } 1575 1576 /* forward solve the U^T */ 1577 idx = 0; 1578 for (i=0; i<n; i++) { 1579 v = aa + bs2*diag[i]; 1580 /* multiply by the inverse of the block diagonal */ 1581 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1582 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1583 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1584 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1585 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1586 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1587 v -= bs2; 1588 1589 vi = aj + diag[i] - 1; 1590 nz = diag[i] - diag[i+1] - 1; 1591 for(j=0;j>-nz;j--){ 1592 oidx = bs*vi[j]; 1593 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1594 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1595 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1596 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1597 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1598 v -= bs2; 1599 } 1600 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1601 idx += bs; 1602 } 1603 /* backward solve the L^T */ 1604 for (i=n-1; i>=0; i--){ 1605 v = aa + bs2*ai[i]; 1606 vi = aj + ai[i]; 1607 nz = ai[i+1] - ai[i]; 1608 idt = bs*i; 1609 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1610 for(j=0;j<nz;j++){ 1611 idx = bs*vi[j]; 1612 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1613 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1614 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1615 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1616 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1617 v += bs2; 1618 } 1619 } 1620 1621 /* copy t into x according to permutation */ 1622 for(i=0;i<n;i++){ 1623 ii = bs*i; ir = bs*r[i]; 1624 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1625 x[ir+4] = t[ii+4]; 1626 } 1627 1628 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1629 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1630 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1632 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1633 PetscFunctionReturn(0); 1634 } 1635 1636 #undef __FUNCT__ 1637 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1638 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1639 { 1640 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1641 IS iscol=a->col,isrow=a->row; 1642 PetscErrorCode ierr; 1643 const PetscInt *r,*c,*rout,*cout; 1644 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1645 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1646 MatScalar *aa=a->a,*v; 1647 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1648 PetscScalar *x,*b,*t; 1649 1650 PetscFunctionBegin; 1651 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1652 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1653 t = a->solve_work; 1654 1655 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1656 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1657 1658 /* copy the b into temp work space according to permutation */ 1659 ii = 0; 1660 for (i=0; i<n; i++) { 1661 ic = 6*c[i]; 1662 t[ii] = b[ic]; 1663 t[ii+1] = b[ic+1]; 1664 t[ii+2] = b[ic+2]; 1665 t[ii+3] = b[ic+3]; 1666 t[ii+4] = b[ic+4]; 1667 t[ii+5] = b[ic+5]; 1668 ii += 6; 1669 } 1670 1671 /* forward solve the U^T */ 1672 idx = 0; 1673 for (i=0; i<n; i++) { 1674 1675 v = aa + 36*diag[i]; 1676 /* multiply by the inverse of the block diagonal */ 1677 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1678 x6 = t[5+idx]; 1679 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1680 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1681 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1682 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1683 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1684 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1685 v += 36; 1686 1687 vi = aj + diag[i] + 1; 1688 nz = ai[i+1] - diag[i] - 1; 1689 while (nz--) { 1690 oidx = 6*(*vi++); 1691 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1692 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1693 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1694 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1695 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1696 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1697 v += 36; 1698 } 1699 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1700 t[5+idx] = s6; 1701 idx += 6; 1702 } 1703 /* backward solve the L^T */ 1704 for (i=n-1; i>=0; i--){ 1705 v = aa + 36*diag[i] - 36; 1706 vi = aj + diag[i] - 1; 1707 nz = diag[i] - ai[i]; 1708 idt = 6*i; 1709 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1710 s6 = t[5+idt]; 1711 while (nz--) { 1712 idx = 6*(*vi--); 1713 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1714 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1715 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1716 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1717 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1718 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1719 v -= 36; 1720 } 1721 } 1722 1723 /* copy t into x according to permutation */ 1724 ii = 0; 1725 for (i=0; i<n; i++) { 1726 ir = 6*r[i]; 1727 x[ir] = t[ii]; 1728 x[ir+1] = t[ii+1]; 1729 x[ir+2] = t[ii+2]; 1730 x[ir+3] = t[ii+3]; 1731 x[ir+4] = t[ii+4]; 1732 x[ir+5] = t[ii+5]; 1733 ii += 6; 1734 } 1735 1736 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1737 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1738 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1739 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1740 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1741 PetscFunctionReturn(0); 1742 } 1743 1744 #undef __FUNCT__ 1745 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1746 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1747 { 1748 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1749 PetscErrorCode ierr; 1750 IS iscol=a->col,isrow=a->row; 1751 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1752 const PetscInt *r,*c,*rout,*cout; 1753 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1754 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1755 MatScalar *aa=a->a,*v; 1756 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1757 PetscScalar *x,*b,*t; 1758 1759 PetscFunctionBegin; 1760 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1761 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1762 t = a->solve_work; 1763 1764 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1765 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1766 1767 /* copy b into temp work space according to permutation */ 1768 for(i=0;i<n;i++){ 1769 ii = bs*i; ic = bs*c[i]; 1770 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1771 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1772 } 1773 1774 /* forward solve the U^T */ 1775 idx = 0; 1776 for (i=0; i<n; i++) { 1777 v = aa + bs2*diag[i]; 1778 /* multiply by the inverse of the block diagonal */ 1779 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1780 x6 = t[5+idx]; 1781 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1782 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1783 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1784 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1785 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1786 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1787 v -= bs2; 1788 1789 vi = aj + diag[i] - 1; 1790 nz = diag[i] - diag[i+1] - 1; 1791 for(j=0;j>-nz;j--){ 1792 oidx = bs*vi[j]; 1793 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1794 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1795 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1796 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1797 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1798 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1799 v -= bs2; 1800 } 1801 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1802 t[5+idx] = s6; 1803 idx += bs; 1804 } 1805 /* backward solve the L^T */ 1806 for (i=n-1; i>=0; i--){ 1807 v = aa + bs2*ai[i]; 1808 vi = aj + ai[i]; 1809 nz = ai[i+1] - ai[i]; 1810 idt = bs*i; 1811 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1812 s6 = t[5+idt]; 1813 for(j=0;j<nz;j++){ 1814 idx = bs*vi[j]; 1815 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1816 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1817 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1818 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1819 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1820 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1821 v += bs2; 1822 } 1823 } 1824 1825 /* copy t into x according to permutation */ 1826 for(i=0;i<n;i++){ 1827 ii = bs*i; ir = bs*r[i]; 1828 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1829 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1830 } 1831 1832 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1833 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1834 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1835 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1836 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1837 PetscFunctionReturn(0); 1838 } 1839 1840 #undef __FUNCT__ 1841 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1842 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1843 { 1844 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1845 IS iscol=a->col,isrow=a->row; 1846 PetscErrorCode ierr; 1847 const PetscInt *r,*c,*rout,*cout; 1848 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1849 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1850 MatScalar *aa=a->a,*v; 1851 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1852 PetscScalar *x,*b,*t; 1853 1854 PetscFunctionBegin; 1855 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1856 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1857 t = a->solve_work; 1858 1859 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1860 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1861 1862 /* copy the b into temp work space according to permutation */ 1863 ii = 0; 1864 for (i=0; i<n; i++) { 1865 ic = 7*c[i]; 1866 t[ii] = b[ic]; 1867 t[ii+1] = b[ic+1]; 1868 t[ii+2] = b[ic+2]; 1869 t[ii+3] = b[ic+3]; 1870 t[ii+4] = b[ic+4]; 1871 t[ii+5] = b[ic+5]; 1872 t[ii+6] = b[ic+6]; 1873 ii += 7; 1874 } 1875 1876 /* forward solve the U^T */ 1877 idx = 0; 1878 for (i=0; i<n; i++) { 1879 1880 v = aa + 49*diag[i]; 1881 /* multiply by the inverse of the block diagonal */ 1882 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1883 x6 = t[5+idx]; x7 = t[6+idx]; 1884 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1885 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1886 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1887 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1888 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1889 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1890 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1891 v += 49; 1892 1893 vi = aj + diag[i] + 1; 1894 nz = ai[i+1] - diag[i] - 1; 1895 while (nz--) { 1896 oidx = 7*(*vi++); 1897 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1898 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1899 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1900 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1901 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1902 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1903 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1904 v += 49; 1905 } 1906 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1907 t[5+idx] = s6;t[6+idx] = s7; 1908 idx += 7; 1909 } 1910 /* backward solve the L^T */ 1911 for (i=n-1; i>=0; i--){ 1912 v = aa + 49*diag[i] - 49; 1913 vi = aj + diag[i] - 1; 1914 nz = diag[i] - ai[i]; 1915 idt = 7*i; 1916 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1917 s6 = t[5+idt];s7 = t[6+idt]; 1918 while (nz--) { 1919 idx = 7*(*vi--); 1920 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1921 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1922 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1923 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1924 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1925 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1926 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1927 v -= 49; 1928 } 1929 } 1930 1931 /* copy t into x according to permutation */ 1932 ii = 0; 1933 for (i=0; i<n; i++) { 1934 ir = 7*r[i]; 1935 x[ir] = t[ii]; 1936 x[ir+1] = t[ii+1]; 1937 x[ir+2] = t[ii+2]; 1938 x[ir+3] = t[ii+3]; 1939 x[ir+4] = t[ii+4]; 1940 x[ir+5] = t[ii+5]; 1941 x[ir+6] = t[ii+6]; 1942 ii += 7; 1943 } 1944 1945 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1946 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1947 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1948 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1949 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1950 PetscFunctionReturn(0); 1951 } 1952 #undef __FUNCT__ 1953 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1954 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1955 { 1956 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1957 PetscErrorCode ierr; 1958 IS iscol=a->col,isrow=a->row; 1959 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1960 const PetscInt *r,*c,*rout,*cout; 1961 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1962 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1963 MatScalar *aa=a->a,*v; 1964 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1965 PetscScalar *x,*b,*t; 1966 1967 PetscFunctionBegin; 1968 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1969 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1970 t = a->solve_work; 1971 1972 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1973 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1974 1975 /* copy b into temp work space according to permutation */ 1976 for(i=0;i<n;i++){ 1977 ii = bs*i; ic = bs*c[i]; 1978 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1979 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 1980 } 1981 1982 /* forward solve the U^T */ 1983 idx = 0; 1984 for (i=0; i<n; i++) { 1985 v = aa + bs2*diag[i]; 1986 /* multiply by the inverse of the block diagonal */ 1987 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1988 x6 = t[5+idx]; x7 = t[6+idx]; 1989 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1990 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1991 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1992 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1993 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1994 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1995 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1996 v -= bs2; 1997 1998 vi = aj + diag[i] - 1; 1999 nz = diag[i] - diag[i+1] - 1; 2000 for(j=0;j>-nz;j--){ 2001 oidx = bs*vi[j]; 2002 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2003 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2004 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2005 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2006 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2007 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2008 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2009 v -= bs2; 2010 } 2011 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2012 t[5+idx] = s6; t[6+idx] = s7; 2013 idx += bs; 2014 } 2015 /* backward solve the L^T */ 2016 for (i=n-1; i>=0; i--){ 2017 v = aa + bs2*ai[i]; 2018 vi = aj + ai[i]; 2019 nz = ai[i+1] - ai[i]; 2020 idt = bs*i; 2021 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2022 s6 = t[5+idt]; s7 = t[6+idt]; 2023 for(j=0;j<nz;j++){ 2024 idx = bs*vi[j]; 2025 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2026 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2027 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2028 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2029 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2030 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2031 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2032 v += bs2; 2033 } 2034 } 2035 2036 /* copy t into x according to permutation */ 2037 for(i=0;i<n;i++){ 2038 ii = bs*i; ir = bs*r[i]; 2039 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2040 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2041 } 2042 2043 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2044 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2045 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2046 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2047 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2048 PetscFunctionReturn(0); 2049 } 2050 2051 /* ----------------------------------------------------------- */ 2052 #undef __FUNCT__ 2053 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2054 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2055 { 2056 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2057 IS iscol=a->col,isrow=a->row; 2058 PetscErrorCode ierr; 2059 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2060 PetscInt i,n=a->mbs; 2061 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2062 MatScalar *aa=a->a,*v; 2063 PetscScalar *x,*b,*s,*t,*ls; 2064 2065 PetscFunctionBegin; 2066 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2067 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2068 t = a->solve_work; 2069 2070 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2071 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2072 2073 /* forward solve the lower triangular */ 2074 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2075 for (i=1; i<n; i++) { 2076 v = aa + bs2*ai[i]; 2077 vi = aj + ai[i]; 2078 nz = a->diag[i] - ai[i]; 2079 s = t + bs*i; 2080 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2081 while (nz--) { 2082 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2083 v += bs2; 2084 } 2085 } 2086 /* backward solve the upper triangular */ 2087 ls = a->solve_work + A->cmap->n; 2088 for (i=n-1; i>=0; i--){ 2089 v = aa + bs2*(a->diag[i] + 1); 2090 vi = aj + a->diag[i] + 1; 2091 nz = ai[i+1] - a->diag[i] - 1; 2092 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2093 while (nz--) { 2094 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2095 v += bs2; 2096 } 2097 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2098 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2099 } 2100 2101 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2102 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2103 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2104 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2105 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2106 PetscFunctionReturn(0); 2107 } 2108 2109 /* ----------------------------------------------------------- */ 2110 #undef __FUNCT__ 2111 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2112 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2113 { 2114 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2115 IS iscol=a->col,isrow=a->row; 2116 PetscErrorCode ierr; 2117 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2118 PetscInt i,n=a->mbs,j; 2119 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2120 const MatScalar *aa=a->a,*v; 2121 PetscScalar *x,*t,*ls; 2122 const PetscScalar *b; 2123 PetscFunctionBegin; 2124 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2126 t = a->solve_work; 2127 2128 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2129 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2130 2131 /* copy the b into temp work space according to permutation */ 2132 for (i=0; i<n; i++) { 2133 for (j=0; j<bs; j++) { 2134 t[i*bs+j] = b[c[i]*bs+j]; 2135 } 2136 } 2137 2138 2139 /* forward solve the upper triangular transpose */ 2140 ls = a->solve_work + A->cmap->n; 2141 for (i=0; i<n; i++){ 2142 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2143 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2144 v = aa + bs2*(a->diag[i] + 1); 2145 vi = aj + a->diag[i] + 1; 2146 nz = ai[i+1] - a->diag[i] - 1; 2147 while (nz--) { 2148 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2149 v += bs2; 2150 } 2151 } 2152 2153 /* backward solve the lower triangular transpose */ 2154 for (i=n-1; i>=0; i--) { 2155 v = aa + bs2*ai[i]; 2156 vi = aj + ai[i]; 2157 nz = a->diag[i] - ai[i]; 2158 while (nz--) { 2159 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2160 v += bs2; 2161 } 2162 } 2163 2164 /* copy t into x according to permutation */ 2165 for (i=0; i<n; i++) { 2166 for (j=0; j<bs; j++) { 2167 x[bs*r[i]+j] = t[bs*i+j]; 2168 } 2169 } 2170 2171 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2172 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2173 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2174 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2175 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2176 PetscFunctionReturn(0); 2177 } 2178 2179 #undef __FUNCT__ 2180 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2181 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2182 { 2183 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2184 IS iscol=a->col,isrow=a->row; 2185 PetscErrorCode ierr; 2186 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2187 PetscInt i,n=a->mbs,j; 2188 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2189 const MatScalar *aa=a->a,*v; 2190 PetscScalar *x,*t,*ls; 2191 const PetscScalar *b; 2192 PetscFunctionBegin; 2193 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2194 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2195 t = a->solve_work; 2196 2197 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2198 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2199 2200 /* copy the b into temp work space according to permutation */ 2201 for (i=0; i<n; i++) { 2202 for (j=0; j<bs; j++) { 2203 t[i*bs+j] = b[c[i]*bs+j]; 2204 } 2205 } 2206 2207 2208 /* forward solve the upper triangular transpose */ 2209 ls = a->solve_work + A->cmap->n; 2210 for (i=0; i<n; i++){ 2211 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2212 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2213 v = aa + bs2*(diag[i] - 1); 2214 vi = aj + diag[i] - 1; 2215 nz = diag[i] - diag[i+1] - 1; 2216 for(j=0;j>-nz;j--){ 2217 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2218 v -= bs2; 2219 } 2220 } 2221 2222 /* backward solve the lower triangular transpose */ 2223 for (i=n-1; i>=0; i--) { 2224 v = aa + bs2*ai[i]; 2225 vi = aj + ai[i]; 2226 nz = ai[i+1] - ai[i]; 2227 for(j=0;j<nz;j++){ 2228 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2229 v += bs2; 2230 } 2231 } 2232 2233 /* copy t into x according to permutation */ 2234 for (i=0; i<n; i++) { 2235 for (j=0; j<bs; j++) { 2236 x[bs*r[i]+j] = t[bs*i+j]; 2237 } 2238 } 2239 2240 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2241 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2242 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2243 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2244 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2245 PetscFunctionReturn(0); 2246 } 2247 2248 /* bs = 15 for PFLOTRAN */ 2249 2250 #undef __FUNCT__ 2251 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering" 2252 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx) 2253 { 2254 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2255 PetscErrorCode ierr; 2256 const PetscInt *ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2257 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2258 MatScalar *aa=a->a,*v; 2259 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2260 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2261 PetscScalar *x,*b,*t; 2262 2263 PetscFunctionBegin; 2264 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2265 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2266 t = a->solve_work; 2267 2268 /* forward solve the lower triangular */ 2269 idx = 0; 2270 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2271 t[5] = b[5+idx]; t[6] = b[6+idx]; t[7] = b[7+idx]; t[8] = b[8+idx]; t[9] = b[9+idx]; 2272 t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx]; 2273 2274 for (i=1; i<n; i++) { 2275 v = aa + bs2*ai[i]; 2276 vi = aj + ai[i]; 2277 nz = ai[i+1] - ai[i]; 2278 idx = bs*i; 2279 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx]; s5 = b[4+idx]; 2280 s6 = b[5+idx]; s7 = b[6+idx]; s8 = b[7+idx]; s9 = b[8+idx]; s10 = b[9+idx]; 2281 s11 = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx]; 2282 for(m=0;m<nz;m++){ 2283 idx = bs*vi[m]; 2284 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2285 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2286 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2287 2288 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2289 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2290 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2291 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2292 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2293 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2294 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2295 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2296 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2297 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2298 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2299 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2300 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2301 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2302 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2303 2304 v += bs2; 2305 } 2306 idx = bs*i; 2307 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] = s5; 2308 t[5+idx] = s6; t[6+idx] = s7; t[7+idx] = s8; t[8+idx] = s9; t[9+idx] = s10; 2309 t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15; 2310 2311 } 2312 /* backward solve the upper triangular */ 2313 for (i=n-1; i>=0; i--){ 2314 v = aa + bs2*(adiag[i+1]+1); 2315 vi = aj + adiag[i+1]+1; 2316 nz = adiag[i] - adiag[i+1] - 1; 2317 idt = bs*i; 2318 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2319 s6 = t[5+idt]; s7 = t[6+idt]; s8 = t[7+idt]; s9 = t[8+idt]; s10 = t[9+idt]; 2320 s11 = t[10+idt]; s12 = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt]; 2321 2322 for(m=0;m<nz;m++){ 2323 idx = bs*vi[m]; 2324 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2325 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2326 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2327 2328 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2329 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2330 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2331 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2332 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2333 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2334 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2335 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2336 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2337 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2338 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2339 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2340 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2341 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2342 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2343 2344 v += bs2; 2345 } 2346 idc = bs*i; 2347 2348 x[idc] = t[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2349 x[1+idc] = t[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2350 x[2+idc] = t[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2351 x[3+idc] = t[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2352 x[4+idc] = t[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2353 x[5+idc] = t[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2354 x[6+idc] = t[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2355 x[7+idc] = t[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2356 x[8+idc] = t[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2357 x[9+idc] = t[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2358 x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2359 x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2360 x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2361 x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2362 x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2363 2364 } 2365 2366 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2367 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2368 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2369 PetscFunctionReturn(0); 2370 } 2371 2372 #undef __FUNCT__ 2373 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2374 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2375 { 2376 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2377 IS iscol=a->col,isrow=a->row; 2378 PetscErrorCode ierr; 2379 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 2380 PetscInt i,n=a->mbs,nz,idx,idt,idc; 2381 MatScalar *aa=a->a,*v; 2382 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2383 PetscScalar *x,*b,*t; 2384 2385 PetscFunctionBegin; 2386 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2387 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2388 t = a->solve_work; 2389 2390 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2391 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2392 2393 /* forward solve the lower triangular */ 2394 idx = 7*(*r++); 2395 t[0] = b[idx]; t[1] = b[1+idx]; 2396 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2397 t[5] = b[5+idx]; t[6] = b[6+idx]; 2398 2399 for (i=1; i<n; i++) { 2400 v = aa + 49*ai[i]; 2401 vi = aj + ai[i]; 2402 nz = diag[i] - ai[i]; 2403 idx = 7*(*r++); 2404 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2405 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2406 while (nz--) { 2407 idx = 7*(*vi++); 2408 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2409 x4 = t[3+idx];x5 = t[4+idx]; 2410 x6 = t[5+idx];x7 = t[6+idx]; 2411 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2412 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2413 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2414 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2415 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2416 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2417 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2418 v += 49; 2419 } 2420 idx = 7*i; 2421 t[idx] = s1;t[1+idx] = s2; 2422 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2423 t[5+idx] = s6;t[6+idx] = s7; 2424 } 2425 /* backward solve the upper triangular */ 2426 for (i=n-1; i>=0; i--){ 2427 v = aa + 49*diag[i] + 49; 2428 vi = aj + diag[i] + 1; 2429 nz = ai[i+1] - diag[i] - 1; 2430 idt = 7*i; 2431 s1 = t[idt]; s2 = t[1+idt]; 2432 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2433 s6 = t[5+idt];s7 = t[6+idt]; 2434 while (nz--) { 2435 idx = 7*(*vi++); 2436 x1 = t[idx]; x2 = t[1+idx]; 2437 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2438 x6 = t[5+idx]; x7 = t[6+idx]; 2439 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2440 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2441 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2442 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2443 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2444 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2445 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2446 v += 49; 2447 } 2448 idc = 7*(*c--); 2449 v = aa + 49*diag[i]; 2450 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2451 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2452 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2453 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2454 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2455 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2456 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2457 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2458 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2459 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2460 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2461 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2462 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2463 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2464 } 2465 2466 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2467 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2468 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2469 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2470 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2471 PetscFunctionReturn(0); 2472 } 2473 2474 #undef __FUNCT__ 2475 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2476 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2477 { 2478 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2479 IS iscol=a->col,isrow=a->row; 2480 PetscErrorCode ierr; 2481 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 2482 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2483 MatScalar *aa=a->a,*v; 2484 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2485 PetscScalar *x,*b,*t; 2486 2487 PetscFunctionBegin; 2488 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2489 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2490 t = a->solve_work; 2491 2492 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2493 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2494 2495 /* forward solve the lower triangular */ 2496 idx = 7*r[0]; 2497 t[0] = b[idx]; t[1] = b[1+idx]; 2498 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2499 t[5] = b[5+idx]; t[6] = b[6+idx]; 2500 2501 for (i=1; i<n; i++) { 2502 v = aa + 49*ai[i]; 2503 vi = aj + ai[i]; 2504 nz = ai[i+1] - ai[i]; 2505 idx = 7*r[i]; 2506 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2507 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2508 for(m=0;m<nz;m++){ 2509 idx = 7*vi[m]; 2510 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2511 x4 = t[3+idx];x5 = t[4+idx]; 2512 x6 = t[5+idx];x7 = t[6+idx]; 2513 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2514 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2515 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2516 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2517 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2518 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2519 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2520 v += 49; 2521 } 2522 idx = 7*i; 2523 t[idx] = s1;t[1+idx] = s2; 2524 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2525 t[5+idx] = s6;t[6+idx] = s7; 2526 } 2527 /* backward solve the upper triangular */ 2528 for (i=n-1; i>=0; i--){ 2529 v = aa + 49*(adiag[i+1]+1); 2530 vi = aj + adiag[i+1]+1; 2531 nz = adiag[i] - adiag[i+1] - 1; 2532 idt = 7*i; 2533 s1 = t[idt]; s2 = t[1+idt]; 2534 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2535 s6 = t[5+idt];s7 = t[6+idt]; 2536 for(m=0;m<nz;m++){ 2537 idx = 7*vi[m]; 2538 x1 = t[idx]; x2 = t[1+idx]; 2539 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2540 x6 = t[5+idx]; x7 = t[6+idx]; 2541 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2542 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2543 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2544 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2545 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2546 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2547 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2548 v += 49; 2549 } 2550 idc = 7*c[i]; 2551 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2552 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2553 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2554 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2555 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2556 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2557 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2558 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2559 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2560 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2561 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2562 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2563 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2564 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2565 } 2566 2567 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2568 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2569 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2570 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2571 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2572 PetscFunctionReturn(0); 2573 } 2574 2575 #undef __FUNCT__ 2576 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2577 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2578 { 2579 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2580 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2581 PetscErrorCode ierr; 2582 PetscInt *diag = a->diag,jdx; 2583 const MatScalar *aa=a->a,*v; 2584 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2585 const PetscScalar *b; 2586 2587 PetscFunctionBegin; 2588 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2589 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2590 /* forward solve the lower triangular */ 2591 idx = 0; 2592 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2593 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2594 x[6] = b[6+idx]; 2595 for (i=1; i<n; i++) { 2596 v = aa + 49*ai[i]; 2597 vi = aj + ai[i]; 2598 nz = diag[i] - ai[i]; 2599 idx = 7*i; 2600 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2601 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2602 s7 = b[6+idx]; 2603 while (nz--) { 2604 jdx = 7*(*vi++); 2605 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2606 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2607 x7 = x[6+jdx]; 2608 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2609 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2610 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2611 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2612 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2613 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2614 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2615 v += 49; 2616 } 2617 x[idx] = s1; 2618 x[1+idx] = s2; 2619 x[2+idx] = s3; 2620 x[3+idx] = s4; 2621 x[4+idx] = s5; 2622 x[5+idx] = s6; 2623 x[6+idx] = s7; 2624 } 2625 /* backward solve the upper triangular */ 2626 for (i=n-1; i>=0; i--){ 2627 v = aa + 49*diag[i] + 49; 2628 vi = aj + diag[i] + 1; 2629 nz = ai[i+1] - diag[i] - 1; 2630 idt = 7*i; 2631 s1 = x[idt]; s2 = x[1+idt]; 2632 s3 = x[2+idt]; s4 = x[3+idt]; 2633 s5 = x[4+idt]; s6 = x[5+idt]; 2634 s7 = x[6+idt]; 2635 while (nz--) { 2636 idx = 7*(*vi++); 2637 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2638 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2639 x7 = x[6+idx]; 2640 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2641 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2642 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2643 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2644 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2645 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2646 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2647 v += 49; 2648 } 2649 v = aa + 49*diag[i]; 2650 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2651 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2652 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2653 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2654 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2655 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2656 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2657 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2658 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2659 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2660 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2661 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2662 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2663 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2664 } 2665 2666 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2667 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2668 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2669 PetscFunctionReturn(0); 2670 } 2671 2672 #undef __FUNCT__ 2673 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2674 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2675 { 2676 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2677 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2678 PetscErrorCode ierr; 2679 PetscInt idx,jdx,idt; 2680 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2681 const MatScalar *aa=a->a,*v; 2682 PetscScalar *x; 2683 const PetscScalar *b; 2684 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2685 2686 PetscFunctionBegin; 2687 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2688 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2689 /* forward solve the lower triangular */ 2690 idx = 0; 2691 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2692 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2693 for (i=1; i<n; i++) { 2694 v = aa + bs2*ai[i]; 2695 vi = aj + ai[i]; 2696 nz = ai[i+1] - ai[i]; 2697 idx = bs*i; 2698 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2699 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2700 for(k=0;k<nz;k++) { 2701 jdx = bs*vi[k]; 2702 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2703 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2704 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2705 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2706 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2707 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2708 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2709 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2710 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2711 v += bs2; 2712 } 2713 2714 x[idx] = s1; 2715 x[1+idx] = s2; 2716 x[2+idx] = s3; 2717 x[3+idx] = s4; 2718 x[4+idx] = s5; 2719 x[5+idx] = s6; 2720 x[6+idx] = s7; 2721 } 2722 2723 /* backward solve the upper triangular */ 2724 for (i=n-1; i>=0; i--){ 2725 v = aa + bs2*(adiag[i+1]+1); 2726 vi = aj + adiag[i+1]+1; 2727 nz = adiag[i] - adiag[i+1]-1; 2728 idt = bs*i; 2729 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2730 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2731 for(k=0;k<nz;k++) { 2732 idx = bs*vi[k]; 2733 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2734 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2735 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2736 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2737 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2738 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2739 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2740 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2741 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2742 v += bs2; 2743 } 2744 /* x = inv_diagonal*x */ 2745 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2746 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2747 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2748 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2749 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2750 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2751 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2752 } 2753 2754 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2755 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2756 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2757 PetscFunctionReturn(0); 2758 } 2759 2760 #undef __FUNCT__ 2761 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2762 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2763 { 2764 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2765 IS iscol=a->col,isrow=a->row; 2766 PetscErrorCode ierr; 2767 const PetscInt *r,*c,*rout,*cout; 2768 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2769 const MatScalar *aa=a->a,*v; 2770 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2771 const PetscScalar *b; 2772 PetscFunctionBegin; 2773 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2774 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2775 t = a->solve_work; 2776 2777 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2778 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2779 2780 /* forward solve the lower triangular */ 2781 idx = 6*(*r++); 2782 t[0] = b[idx]; t[1] = b[1+idx]; 2783 t[2] = b[2+idx]; t[3] = b[3+idx]; 2784 t[4] = b[4+idx]; t[5] = b[5+idx]; 2785 for (i=1; i<n; i++) { 2786 v = aa + 36*ai[i]; 2787 vi = aj + ai[i]; 2788 nz = diag[i] - ai[i]; 2789 idx = 6*(*r++); 2790 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2791 s5 = b[4+idx]; s6 = b[5+idx]; 2792 while (nz--) { 2793 idx = 6*(*vi++); 2794 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2795 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2796 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2797 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2798 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2799 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2800 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2801 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2802 v += 36; 2803 } 2804 idx = 6*i; 2805 t[idx] = s1;t[1+idx] = s2; 2806 t[2+idx] = s3;t[3+idx] = s4; 2807 t[4+idx] = s5;t[5+idx] = s6; 2808 } 2809 /* backward solve the upper triangular */ 2810 for (i=n-1; i>=0; i--){ 2811 v = aa + 36*diag[i] + 36; 2812 vi = aj + diag[i] + 1; 2813 nz = ai[i+1] - diag[i] - 1; 2814 idt = 6*i; 2815 s1 = t[idt]; s2 = t[1+idt]; 2816 s3 = t[2+idt];s4 = t[3+idt]; 2817 s5 = t[4+idt];s6 = t[5+idt]; 2818 while (nz--) { 2819 idx = 6*(*vi++); 2820 x1 = t[idx]; x2 = t[1+idx]; 2821 x3 = t[2+idx]; x4 = t[3+idx]; 2822 x5 = t[4+idx]; x6 = t[5+idx]; 2823 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2824 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2825 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2826 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2827 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2828 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2829 v += 36; 2830 } 2831 idc = 6*(*c--); 2832 v = aa + 36*diag[i]; 2833 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2834 v[18]*s4+v[24]*s5+v[30]*s6; 2835 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2836 v[19]*s4+v[25]*s5+v[31]*s6; 2837 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2838 v[20]*s4+v[26]*s5+v[32]*s6; 2839 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2840 v[21]*s4+v[27]*s5+v[33]*s6; 2841 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2842 v[22]*s4+v[28]*s5+v[34]*s6; 2843 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2844 v[23]*s4+v[29]*s5+v[35]*s6; 2845 } 2846 2847 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2848 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2849 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2850 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2851 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2852 PetscFunctionReturn(0); 2853 } 2854 2855 #undef __FUNCT__ 2856 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2857 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 2858 { 2859 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2860 IS iscol=a->col,isrow=a->row; 2861 PetscErrorCode ierr; 2862 const PetscInt *r,*c,*rout,*cout; 2863 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2864 const MatScalar *aa=a->a,*v; 2865 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2866 const PetscScalar *b; 2867 PetscFunctionBegin; 2868 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2869 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2870 t = a->solve_work; 2871 2872 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2873 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2874 2875 /* forward solve the lower triangular */ 2876 idx = 6*r[0]; 2877 t[0] = b[idx]; t[1] = b[1+idx]; 2878 t[2] = b[2+idx]; t[3] = b[3+idx]; 2879 t[4] = b[4+idx]; t[5] = b[5+idx]; 2880 for (i=1; i<n; i++) { 2881 v = aa + 36*ai[i]; 2882 vi = aj + ai[i]; 2883 nz = ai[i+1] - ai[i]; 2884 idx = 6*r[i]; 2885 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2886 s5 = b[4+idx]; s6 = b[5+idx]; 2887 for(m=0;m<nz;m++){ 2888 idx = 6*vi[m]; 2889 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2890 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2891 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2892 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2893 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2894 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2895 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2896 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2897 v += 36; 2898 } 2899 idx = 6*i; 2900 t[idx] = s1;t[1+idx] = s2; 2901 t[2+idx] = s3;t[3+idx] = s4; 2902 t[4+idx] = s5;t[5+idx] = s6; 2903 } 2904 /* backward solve the upper triangular */ 2905 for (i=n-1; i>=0; i--){ 2906 v = aa + 36*(adiag[i+1]+1); 2907 vi = aj + adiag[i+1]+1; 2908 nz = adiag[i] - adiag[i+1] - 1; 2909 idt = 6*i; 2910 s1 = t[idt]; s2 = t[1+idt]; 2911 s3 = t[2+idt];s4 = t[3+idt]; 2912 s5 = t[4+idt];s6 = t[5+idt]; 2913 for(m=0;m<nz;m++){ 2914 idx = 6*vi[m]; 2915 x1 = t[idx]; x2 = t[1+idx]; 2916 x3 = t[2+idx]; x4 = t[3+idx]; 2917 x5 = t[4+idx]; x6 = t[5+idx]; 2918 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2919 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2920 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2921 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2922 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2923 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2924 v += 36; 2925 } 2926 idc = 6*c[i]; 2927 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2928 v[18]*s4+v[24]*s5+v[30]*s6; 2929 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2930 v[19]*s4+v[25]*s5+v[31]*s6; 2931 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2932 v[20]*s4+v[26]*s5+v[32]*s6; 2933 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2934 v[21]*s4+v[27]*s5+v[33]*s6; 2935 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2936 v[22]*s4+v[28]*s5+v[34]*s6; 2937 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2938 v[23]*s4+v[29]*s5+v[35]*s6; 2939 } 2940 2941 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2942 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2943 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2944 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2945 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2946 PetscFunctionReturn(0); 2947 } 2948 2949 #undef __FUNCT__ 2950 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 2951 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2952 { 2953 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2954 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2955 PetscErrorCode ierr; 2956 PetscInt *diag = a->diag,jdx; 2957 const MatScalar *aa=a->a,*v; 2958 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2959 const PetscScalar *b; 2960 2961 PetscFunctionBegin; 2962 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2963 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2964 /* forward solve the lower triangular */ 2965 idx = 0; 2966 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2967 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2968 for (i=1; i<n; i++) { 2969 v = aa + 36*ai[i]; 2970 vi = aj + ai[i]; 2971 nz = diag[i] - ai[i]; 2972 idx = 6*i; 2973 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2974 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2975 while (nz--) { 2976 jdx = 6*(*vi++); 2977 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2978 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2979 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2980 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2981 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2982 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2983 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2984 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2985 v += 36; 2986 } 2987 x[idx] = s1; 2988 x[1+idx] = s2; 2989 x[2+idx] = s3; 2990 x[3+idx] = s4; 2991 x[4+idx] = s5; 2992 x[5+idx] = s6; 2993 } 2994 /* backward solve the upper triangular */ 2995 for (i=n-1; i>=0; i--){ 2996 v = aa + 36*diag[i] + 36; 2997 vi = aj + diag[i] + 1; 2998 nz = ai[i+1] - diag[i] - 1; 2999 idt = 6*i; 3000 s1 = x[idt]; s2 = x[1+idt]; 3001 s3 = x[2+idt]; s4 = x[3+idt]; 3002 s5 = x[4+idt]; s6 = x[5+idt]; 3003 while (nz--) { 3004 idx = 6*(*vi++); 3005 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3006 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3007 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3008 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3009 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3010 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3011 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3012 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3013 v += 36; 3014 } 3015 v = aa + 36*diag[i]; 3016 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3017 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3018 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3019 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3020 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3021 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3022 } 3023 3024 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3025 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3026 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3027 PetscFunctionReturn(0); 3028 } 3029 3030 #undef __FUNCT__ 3031 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3032 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3033 { 3034 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3035 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3036 PetscErrorCode ierr; 3037 PetscInt idx,jdx,idt; 3038 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3039 const MatScalar *aa=a->a,*v; 3040 PetscScalar *x; 3041 const PetscScalar *b; 3042 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3043 3044 PetscFunctionBegin; 3045 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3046 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3047 /* forward solve the lower triangular */ 3048 idx = 0; 3049 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3050 x[4] = b[4+idx];x[5] = b[5+idx]; 3051 for (i=1; i<n; i++) { 3052 v = aa + bs2*ai[i]; 3053 vi = aj + ai[i]; 3054 nz = ai[i+1] - ai[i]; 3055 idx = bs*i; 3056 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3057 s5 = b[4+idx];s6 = b[5+idx]; 3058 for(k=0;k<nz;k++){ 3059 jdx = bs*vi[k]; 3060 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3061 x5 = x[4+jdx]; x6 = x[5+jdx]; 3062 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3063 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3064 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3065 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3066 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3067 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3068 v += bs2; 3069 } 3070 3071 x[idx] = s1; 3072 x[1+idx] = s2; 3073 x[2+idx] = s3; 3074 x[3+idx] = s4; 3075 x[4+idx] = s5; 3076 x[5+idx] = s6; 3077 } 3078 3079 /* backward solve the upper triangular */ 3080 for (i=n-1; i>=0; i--){ 3081 v = aa + bs2*(adiag[i+1]+1); 3082 vi = aj + adiag[i+1]+1; 3083 nz = adiag[i] - adiag[i+1]-1; 3084 idt = bs*i; 3085 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3086 s5 = x[4+idt];s6 = x[5+idt]; 3087 for(k=0;k<nz;k++){ 3088 idx = bs*vi[k]; 3089 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3090 x5 = x[4+idx];x6 = x[5+idx]; 3091 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3092 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3093 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3094 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3095 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3096 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3097 v += bs2; 3098 } 3099 /* x = inv_diagonal*x */ 3100 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3101 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3102 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3103 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3104 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3105 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3106 } 3107 3108 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3109 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3110 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3111 PetscFunctionReturn(0); 3112 } 3113 3114 #undef __FUNCT__ 3115 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3116 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3117 { 3118 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3119 IS iscol=a->col,isrow=a->row; 3120 PetscErrorCode ierr; 3121 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3122 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3123 const MatScalar *aa=a->a,*v; 3124 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3125 const PetscScalar *b; 3126 3127 PetscFunctionBegin; 3128 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3130 t = a->solve_work; 3131 3132 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3133 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3134 3135 /* forward solve the lower triangular */ 3136 idx = 5*(*r++); 3137 t[0] = b[idx]; t[1] = b[1+idx]; 3138 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3139 for (i=1; i<n; i++) { 3140 v = aa + 25*ai[i]; 3141 vi = aj + ai[i]; 3142 nz = diag[i] - ai[i]; 3143 idx = 5*(*r++); 3144 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3145 s5 = b[4+idx]; 3146 while (nz--) { 3147 idx = 5*(*vi++); 3148 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3149 x4 = t[3+idx];x5 = t[4+idx]; 3150 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3151 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3152 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3153 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3154 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3155 v += 25; 3156 } 3157 idx = 5*i; 3158 t[idx] = s1;t[1+idx] = s2; 3159 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3160 } 3161 /* backward solve the upper triangular */ 3162 for (i=n-1; i>=0; i--){ 3163 v = aa + 25*diag[i] + 25; 3164 vi = aj + diag[i] + 1; 3165 nz = ai[i+1] - diag[i] - 1; 3166 idt = 5*i; 3167 s1 = t[idt]; s2 = t[1+idt]; 3168 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3169 while (nz--) { 3170 idx = 5*(*vi++); 3171 x1 = t[idx]; x2 = t[1+idx]; 3172 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3173 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3174 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3175 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3176 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3177 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3178 v += 25; 3179 } 3180 idc = 5*(*c--); 3181 v = aa + 25*diag[i]; 3182 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3183 v[15]*s4+v[20]*s5; 3184 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3185 v[16]*s4+v[21]*s5; 3186 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3187 v[17]*s4+v[22]*s5; 3188 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3189 v[18]*s4+v[23]*s5; 3190 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3191 v[19]*s4+v[24]*s5; 3192 } 3193 3194 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3195 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3196 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3197 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3198 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3199 PetscFunctionReturn(0); 3200 } 3201 3202 #undef __FUNCT__ 3203 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3204 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3205 { 3206 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3207 IS iscol=a->col,isrow=a->row; 3208 PetscErrorCode ierr; 3209 const PetscInt *r,*c,*rout,*cout; 3210 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3211 const MatScalar *aa=a->a,*v; 3212 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3213 const PetscScalar *b; 3214 3215 PetscFunctionBegin; 3216 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3217 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3218 t = a->solve_work; 3219 3220 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3221 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3222 3223 /* forward solve the lower triangular */ 3224 idx = 5*r[0]; 3225 t[0] = b[idx]; t[1] = b[1+idx]; 3226 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3227 for (i=1; i<n; i++) { 3228 v = aa + 25*ai[i]; 3229 vi = aj + ai[i]; 3230 nz = ai[i+1] - ai[i]; 3231 idx = 5*r[i]; 3232 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3233 s5 = b[4+idx]; 3234 for(m=0;m<nz;m++){ 3235 idx = 5*vi[m]; 3236 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3237 x4 = t[3+idx];x5 = t[4+idx]; 3238 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3239 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3240 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3241 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3242 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3243 v += 25; 3244 } 3245 idx = 5*i; 3246 t[idx] = s1;t[1+idx] = s2; 3247 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3248 } 3249 /* backward solve the upper triangular */ 3250 for (i=n-1; i>=0; i--){ 3251 v = aa + 25*(adiag[i+1]+1); 3252 vi = aj + adiag[i+1]+1; 3253 nz = adiag[i] - adiag[i+1] - 1; 3254 idt = 5*i; 3255 s1 = t[idt]; s2 = t[1+idt]; 3256 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3257 for(m=0;m<nz;m++){ 3258 idx = 5*vi[m]; 3259 x1 = t[idx]; x2 = t[1+idx]; 3260 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3261 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3262 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3263 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3264 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3265 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3266 v += 25; 3267 } 3268 idc = 5*c[i]; 3269 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3270 v[15]*s4+v[20]*s5; 3271 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3272 v[16]*s4+v[21]*s5; 3273 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3274 v[17]*s4+v[22]*s5; 3275 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3276 v[18]*s4+v[23]*s5; 3277 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3278 v[19]*s4+v[24]*s5; 3279 } 3280 3281 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3282 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3283 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3284 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3285 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3286 PetscFunctionReturn(0); 3287 } 3288 3289 #undef __FUNCT__ 3290 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3291 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3292 { 3293 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3294 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3295 PetscErrorCode ierr; 3296 PetscInt *diag = a->diag,jdx; 3297 const MatScalar *aa=a->a,*v; 3298 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3299 const PetscScalar *b; 3300 3301 PetscFunctionBegin; 3302 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3303 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3304 /* forward solve the lower triangular */ 3305 idx = 0; 3306 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3307 for (i=1; i<n; i++) { 3308 v = aa + 25*ai[i]; 3309 vi = aj + ai[i]; 3310 nz = diag[i] - ai[i]; 3311 idx = 5*i; 3312 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3313 while (nz--) { 3314 jdx = 5*(*vi++); 3315 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3316 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3317 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3318 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3319 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3320 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3321 v += 25; 3322 } 3323 x[idx] = s1; 3324 x[1+idx] = s2; 3325 x[2+idx] = s3; 3326 x[3+idx] = s4; 3327 x[4+idx] = s5; 3328 } 3329 /* backward solve the upper triangular */ 3330 for (i=n-1; i>=0; i--){ 3331 v = aa + 25*diag[i] + 25; 3332 vi = aj + diag[i] + 1; 3333 nz = ai[i+1] - diag[i] - 1; 3334 idt = 5*i; 3335 s1 = x[idt]; s2 = x[1+idt]; 3336 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3337 while (nz--) { 3338 idx = 5*(*vi++); 3339 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3340 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3341 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3342 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3343 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3344 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3345 v += 25; 3346 } 3347 v = aa + 25*diag[i]; 3348 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3349 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3350 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3351 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3352 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3353 } 3354 3355 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3356 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3357 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3358 PetscFunctionReturn(0); 3359 } 3360 3361 #undef __FUNCT__ 3362 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3363 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3364 { 3365 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3366 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 3367 PetscErrorCode ierr; 3368 PetscInt jdx; 3369 const MatScalar *aa=a->a,*v; 3370 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3371 const PetscScalar *b; 3372 3373 PetscFunctionBegin; 3374 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3375 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3376 /* forward solve the lower triangular */ 3377 idx = 0; 3378 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3379 for (i=1; i<n; i++) { 3380 v = aa + 25*ai[i]; 3381 vi = aj + ai[i]; 3382 nz = ai[i+1] - ai[i]; 3383 idx = 5*i; 3384 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3385 for(k=0;k<nz;k++) { 3386 jdx = 5*vi[k]; 3387 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3388 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3389 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3390 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3391 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3392 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3393 v += 25; 3394 } 3395 x[idx] = s1; 3396 x[1+idx] = s2; 3397 x[2+idx] = s3; 3398 x[3+idx] = s4; 3399 x[4+idx] = s5; 3400 } 3401 3402 /* backward solve the upper triangular */ 3403 for (i=n-1; i>=0; i--){ 3404 v = aa + 25*(adiag[i+1]+1); 3405 vi = aj + adiag[i+1]+1; 3406 nz = adiag[i] - adiag[i+1]-1; 3407 idt = 5*i; 3408 s1 = x[idt]; s2 = x[1+idt]; 3409 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3410 for(k=0;k<nz;k++){ 3411 idx = 5*vi[k]; 3412 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3413 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3414 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3415 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3416 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3417 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3418 v += 25; 3419 } 3420 /* x = inv_diagonal*x */ 3421 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3422 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3423 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3424 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3425 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3426 } 3427 3428 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3429 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3430 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3431 PetscFunctionReturn(0); 3432 } 3433 3434 #undef __FUNCT__ 3435 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3436 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3437 { 3438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3439 IS iscol=a->col,isrow=a->row; 3440 PetscErrorCode ierr; 3441 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3442 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3443 const MatScalar *aa=a->a,*v; 3444 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3445 const PetscScalar *b; 3446 3447 PetscFunctionBegin; 3448 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3449 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3450 t = a->solve_work; 3451 3452 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3453 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3454 3455 /* forward solve the lower triangular */ 3456 idx = 4*(*r++); 3457 t[0] = b[idx]; t[1] = b[1+idx]; 3458 t[2] = b[2+idx]; t[3] = b[3+idx]; 3459 for (i=1; i<n; i++) { 3460 v = aa + 16*ai[i]; 3461 vi = aj + ai[i]; 3462 nz = diag[i] - ai[i]; 3463 idx = 4*(*r++); 3464 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3465 while (nz--) { 3466 idx = 4*(*vi++); 3467 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3468 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3469 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3470 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3471 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3472 v += 16; 3473 } 3474 idx = 4*i; 3475 t[idx] = s1;t[1+idx] = s2; 3476 t[2+idx] = s3;t[3+idx] = s4; 3477 } 3478 /* backward solve the upper triangular */ 3479 for (i=n-1; i>=0; i--){ 3480 v = aa + 16*diag[i] + 16; 3481 vi = aj + diag[i] + 1; 3482 nz = ai[i+1] - diag[i] - 1; 3483 idt = 4*i; 3484 s1 = t[idt]; s2 = t[1+idt]; 3485 s3 = t[2+idt];s4 = t[3+idt]; 3486 while (nz--) { 3487 idx = 4*(*vi++); 3488 x1 = t[idx]; x2 = t[1+idx]; 3489 x3 = t[2+idx]; x4 = t[3+idx]; 3490 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3491 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3492 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3493 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3494 v += 16; 3495 } 3496 idc = 4*(*c--); 3497 v = aa + 16*diag[i]; 3498 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3499 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3500 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3501 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3502 } 3503 3504 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3505 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3506 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3507 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3508 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3509 PetscFunctionReturn(0); 3510 } 3511 3512 #undef __FUNCT__ 3513 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3514 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3515 { 3516 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3517 IS iscol=a->col,isrow=a->row; 3518 PetscErrorCode ierr; 3519 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3520 const PetscInt *r,*c,*rout,*cout; 3521 const MatScalar *aa=a->a,*v; 3522 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3523 const PetscScalar *b; 3524 3525 PetscFunctionBegin; 3526 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3527 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3528 t = a->solve_work; 3529 3530 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3531 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3532 3533 /* forward solve the lower triangular */ 3534 idx = 4*r[0]; 3535 t[0] = b[idx]; t[1] = b[1+idx]; 3536 t[2] = b[2+idx]; t[3] = b[3+idx]; 3537 for (i=1; i<n; i++) { 3538 v = aa + 16*ai[i]; 3539 vi = aj + ai[i]; 3540 nz = ai[i+1] - ai[i]; 3541 idx = 4*r[i]; 3542 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3543 for(m=0;m<nz;m++){ 3544 idx = 4*vi[m]; 3545 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3546 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3547 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3548 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3549 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3550 v += 16; 3551 } 3552 idx = 4*i; 3553 t[idx] = s1;t[1+idx] = s2; 3554 t[2+idx] = s3;t[3+idx] = s4; 3555 } 3556 /* backward solve the upper triangular */ 3557 for (i=n-1; i>=0; i--){ 3558 v = aa + 16*(adiag[i+1]+1); 3559 vi = aj + adiag[i+1]+1; 3560 nz = adiag[i] - adiag[i+1] - 1; 3561 idt = 4*i; 3562 s1 = t[idt]; s2 = t[1+idt]; 3563 s3 = t[2+idt];s4 = t[3+idt]; 3564 for(m=0;m<nz;m++){ 3565 idx = 4*vi[m]; 3566 x1 = t[idx]; x2 = t[1+idx]; 3567 x3 = t[2+idx]; x4 = t[3+idx]; 3568 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3569 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3570 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3571 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3572 v += 16; 3573 } 3574 idc = 4*c[i]; 3575 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3576 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3577 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3578 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3579 } 3580 3581 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3582 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3583 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3584 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3585 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3586 PetscFunctionReturn(0); 3587 } 3588 3589 #undef __FUNCT__ 3590 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3591 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3592 { 3593 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3594 IS iscol=a->col,isrow=a->row; 3595 PetscErrorCode ierr; 3596 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3597 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3598 const MatScalar *aa=a->a,*v; 3599 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3600 PetscScalar *x; 3601 const PetscScalar *b; 3602 3603 PetscFunctionBegin; 3604 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3605 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3606 t = (MatScalar *)a->solve_work; 3607 3608 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3609 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3610 3611 /* forward solve the lower triangular */ 3612 idx = 4*(*r++); 3613 t[0] = (MatScalar)b[idx]; 3614 t[1] = (MatScalar)b[1+idx]; 3615 t[2] = (MatScalar)b[2+idx]; 3616 t[3] = (MatScalar)b[3+idx]; 3617 for (i=1; i<n; i++) { 3618 v = aa + 16*ai[i]; 3619 vi = aj + ai[i]; 3620 nz = diag[i] - ai[i]; 3621 idx = 4*(*r++); 3622 s1 = (MatScalar)b[idx]; 3623 s2 = (MatScalar)b[1+idx]; 3624 s3 = (MatScalar)b[2+idx]; 3625 s4 = (MatScalar)b[3+idx]; 3626 while (nz--) { 3627 idx = 4*(*vi++); 3628 x1 = t[idx]; 3629 x2 = t[1+idx]; 3630 x3 = t[2+idx]; 3631 x4 = t[3+idx]; 3632 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3633 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3634 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3635 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3636 v += 16; 3637 } 3638 idx = 4*i; 3639 t[idx] = s1; 3640 t[1+idx] = s2; 3641 t[2+idx] = s3; 3642 t[3+idx] = s4; 3643 } 3644 /* backward solve the upper triangular */ 3645 for (i=n-1; i>=0; i--){ 3646 v = aa + 16*diag[i] + 16; 3647 vi = aj + diag[i] + 1; 3648 nz = ai[i+1] - diag[i] - 1; 3649 idt = 4*i; 3650 s1 = t[idt]; 3651 s2 = t[1+idt]; 3652 s3 = t[2+idt]; 3653 s4 = t[3+idt]; 3654 while (nz--) { 3655 idx = 4*(*vi++); 3656 x1 = t[idx]; 3657 x2 = t[1+idx]; 3658 x3 = t[2+idx]; 3659 x4 = t[3+idx]; 3660 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3661 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3662 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3663 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3664 v += 16; 3665 } 3666 idc = 4*(*c--); 3667 v = aa + 16*diag[i]; 3668 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3669 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3670 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3671 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3672 x[idc] = (PetscScalar)t[idt]; 3673 x[1+idc] = (PetscScalar)t[1+idt]; 3674 x[2+idc] = (PetscScalar)t[2+idt]; 3675 x[3+idc] = (PetscScalar)t[3+idt]; 3676 } 3677 3678 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3679 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3680 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3681 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3682 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3683 PetscFunctionReturn(0); 3684 } 3685 3686 #if defined (PETSC_HAVE_SSE) 3687 3688 #include PETSC_HAVE_SSE 3689 3690 #undef __FUNCT__ 3691 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3692 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3693 { 3694 /* 3695 Note: This code uses demotion of double 3696 to float when performing the mixed-mode computation. 3697 This may not be numerically reasonable for all applications. 3698 */ 3699 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3700 IS iscol=a->col,isrow=a->row; 3701 PetscErrorCode ierr; 3702 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3703 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3704 MatScalar *aa=a->a,*v; 3705 PetscScalar *x,*b,*t; 3706 3707 /* Make space in temp stack for 16 Byte Aligned arrays */ 3708 float ssealignedspace[11],*tmps,*tmpx; 3709 unsigned long offset; 3710 3711 PetscFunctionBegin; 3712 SSE_SCOPE_BEGIN; 3713 3714 offset = (unsigned long)ssealignedspace % 16; 3715 if (offset) offset = (16 - offset)/4; 3716 tmps = &ssealignedspace[offset]; 3717 tmpx = &ssealignedspace[offset+4]; 3718 PREFETCH_NTA(aa+16*ai[1]); 3719 3720 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3721 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3722 t = a->solve_work; 3723 3724 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3725 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3726 3727 /* forward solve the lower triangular */ 3728 idx = 4*(*r++); 3729 t[0] = b[idx]; t[1] = b[1+idx]; 3730 t[2] = b[2+idx]; t[3] = b[3+idx]; 3731 v = aa + 16*ai[1]; 3732 3733 for (i=1; i<n;) { 3734 PREFETCH_NTA(&v[8]); 3735 vi = aj + ai[i]; 3736 nz = diag[i] - ai[i]; 3737 idx = 4*(*r++); 3738 3739 /* Demote sum from double to float */ 3740 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3741 LOAD_PS(tmps,XMM7); 3742 3743 while (nz--) { 3744 PREFETCH_NTA(&v[16]); 3745 idx = 4*(*vi++); 3746 3747 /* Demote solution (so far) from double to float */ 3748 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3749 3750 /* 4x4 Matrix-Vector product with negative accumulation: */ 3751 SSE_INLINE_BEGIN_2(tmpx,v) 3752 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3753 3754 /* First Column */ 3755 SSE_COPY_PS(XMM0,XMM6) 3756 SSE_SHUFFLE(XMM0,XMM0,0x00) 3757 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3758 SSE_SUB_PS(XMM7,XMM0) 3759 3760 /* Second Column */ 3761 SSE_COPY_PS(XMM1,XMM6) 3762 SSE_SHUFFLE(XMM1,XMM1,0x55) 3763 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3764 SSE_SUB_PS(XMM7,XMM1) 3765 3766 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3767 3768 /* Third Column */ 3769 SSE_COPY_PS(XMM2,XMM6) 3770 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3771 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3772 SSE_SUB_PS(XMM7,XMM2) 3773 3774 /* Fourth Column */ 3775 SSE_COPY_PS(XMM3,XMM6) 3776 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3777 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3778 SSE_SUB_PS(XMM7,XMM3) 3779 SSE_INLINE_END_2 3780 3781 v += 16; 3782 } 3783 idx = 4*i; 3784 v = aa + 16*ai[++i]; 3785 PREFETCH_NTA(v); 3786 STORE_PS(tmps,XMM7); 3787 3788 /* Promote result from float to double */ 3789 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3790 } 3791 /* backward solve the upper triangular */ 3792 idt = 4*(n-1); 3793 ai16 = 16*diag[n-1]; 3794 v = aa + ai16 + 16; 3795 for (i=n-1; i>=0;){ 3796 PREFETCH_NTA(&v[8]); 3797 vi = aj + diag[i] + 1; 3798 nz = ai[i+1] - diag[i] - 1; 3799 3800 /* Demote accumulator from double to float */ 3801 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3802 LOAD_PS(tmps,XMM7); 3803 3804 while (nz--) { 3805 PREFETCH_NTA(&v[16]); 3806 idx = 4*(*vi++); 3807 3808 /* Demote solution (so far) from double to float */ 3809 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3810 3811 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3812 SSE_INLINE_BEGIN_2(tmpx,v) 3813 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3814 3815 /* First Column */ 3816 SSE_COPY_PS(XMM0,XMM6) 3817 SSE_SHUFFLE(XMM0,XMM0,0x00) 3818 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3819 SSE_SUB_PS(XMM7,XMM0) 3820 3821 /* Second Column */ 3822 SSE_COPY_PS(XMM1,XMM6) 3823 SSE_SHUFFLE(XMM1,XMM1,0x55) 3824 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3825 SSE_SUB_PS(XMM7,XMM1) 3826 3827 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3828 3829 /* Third Column */ 3830 SSE_COPY_PS(XMM2,XMM6) 3831 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3832 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3833 SSE_SUB_PS(XMM7,XMM2) 3834 3835 /* Fourth Column */ 3836 SSE_COPY_PS(XMM3,XMM6) 3837 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3838 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3839 SSE_SUB_PS(XMM7,XMM3) 3840 SSE_INLINE_END_2 3841 v += 16; 3842 } 3843 v = aa + ai16; 3844 ai16 = 16*diag[--i]; 3845 PREFETCH_NTA(aa+ai16+16); 3846 /* 3847 Scale the result by the diagonal 4x4 block, 3848 which was inverted as part of the factorization 3849 */ 3850 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3851 /* First Column */ 3852 SSE_COPY_PS(XMM0,XMM7) 3853 SSE_SHUFFLE(XMM0,XMM0,0x00) 3854 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3855 3856 /* Second Column */ 3857 SSE_COPY_PS(XMM1,XMM7) 3858 SSE_SHUFFLE(XMM1,XMM1,0x55) 3859 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3860 SSE_ADD_PS(XMM0,XMM1) 3861 3862 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3863 3864 /* Third Column */ 3865 SSE_COPY_PS(XMM2,XMM7) 3866 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3867 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3868 SSE_ADD_PS(XMM0,XMM2) 3869 3870 /* Fourth Column */ 3871 SSE_COPY_PS(XMM3,XMM7) 3872 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3873 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3874 SSE_ADD_PS(XMM0,XMM3) 3875 3876 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3877 SSE_INLINE_END_3 3878 3879 /* Promote solution from float to double */ 3880 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3881 3882 /* Apply reordering to t and stream into x. */ 3883 /* This way, x doesn't pollute the cache. */ 3884 /* Be careful with size: 2 doubles = 4 floats! */ 3885 idc = 4*(*c--); 3886 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3887 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3888 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3889 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3890 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3891 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3892 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3893 SSE_INLINE_END_2 3894 v = aa + ai16 + 16; 3895 idt -= 4; 3896 } 3897 3898 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3899 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3900 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3901 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3902 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3903 SSE_SCOPE_END; 3904 PetscFunctionReturn(0); 3905 } 3906 3907 #endif 3908 3909 3910 /* 3911 Special case where the matrix was ILU(0) factored in the natural 3912 ordering. This eliminates the need for the column and row permutation. 3913 */ 3914 #undef __FUNCT__ 3915 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 3916 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3917 { 3918 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3919 PetscInt n=a->mbs; 3920 const PetscInt *ai=a->i,*aj=a->j; 3921 PetscErrorCode ierr; 3922 const PetscInt *diag = a->diag; 3923 const MatScalar *aa=a->a; 3924 PetscScalar *x; 3925 const PetscScalar *b; 3926 3927 PetscFunctionBegin; 3928 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3929 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3930 3931 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3932 { 3933 static PetscScalar w[2000]; /* very BAD need to fix */ 3934 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3935 } 3936 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3937 { 3938 static PetscScalar w[2000]; /* very BAD need to fix */ 3939 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3940 } 3941 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3942 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3943 #else 3944 { 3945 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3946 const MatScalar *v; 3947 PetscInt jdx,idt,idx,nz,i,ai16; 3948 const PetscInt *vi; 3949 3950 /* forward solve the lower triangular */ 3951 idx = 0; 3952 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3953 for (i=1; i<n; i++) { 3954 v = aa + 16*ai[i]; 3955 vi = aj + ai[i]; 3956 nz = diag[i] - ai[i]; 3957 idx += 4; 3958 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3959 while (nz--) { 3960 jdx = 4*(*vi++); 3961 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3962 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3963 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3964 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3965 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3966 v += 16; 3967 } 3968 x[idx] = s1; 3969 x[1+idx] = s2; 3970 x[2+idx] = s3; 3971 x[3+idx] = s4; 3972 } 3973 /* backward solve the upper triangular */ 3974 idt = 4*(n-1); 3975 for (i=n-1; i>=0; i--){ 3976 ai16 = 16*diag[i]; 3977 v = aa + ai16 + 16; 3978 vi = aj + diag[i] + 1; 3979 nz = ai[i+1] - diag[i] - 1; 3980 s1 = x[idt]; s2 = x[1+idt]; 3981 s3 = x[2+idt];s4 = x[3+idt]; 3982 while (nz--) { 3983 idx = 4*(*vi++); 3984 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3985 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3986 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3987 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3988 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3989 v += 16; 3990 } 3991 v = aa + ai16; 3992 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3993 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3994 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3995 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3996 idt -= 4; 3997 } 3998 } 3999 #endif 4000 4001 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4002 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4003 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4004 PetscFunctionReturn(0); 4005 } 4006 4007 #undef __FUNCT__ 4008 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4009 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4010 { 4011 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4012 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4013 PetscErrorCode ierr; 4014 PetscInt idx,jdx,idt; 4015 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4016 const MatScalar *aa=a->a,*v; 4017 PetscScalar *x; 4018 const PetscScalar *b; 4019 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4020 4021 PetscFunctionBegin; 4022 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4023 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4024 /* forward solve the lower triangular */ 4025 idx = 0; 4026 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4027 for (i=1; i<n; i++) { 4028 v = aa + bs2*ai[i]; 4029 vi = aj + ai[i]; 4030 nz = ai[i+1] - ai[i]; 4031 idx = bs*i; 4032 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4033 for(k=0;k<nz;k++) { 4034 jdx = bs*vi[k]; 4035 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4036 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4037 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4038 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4039 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4040 4041 v += bs2; 4042 } 4043 4044 x[idx] = s1; 4045 x[1+idx] = s2; 4046 x[2+idx] = s3; 4047 x[3+idx] = s4; 4048 } 4049 4050 /* backward solve the upper triangular */ 4051 for (i=n-1; i>=0; i--){ 4052 v = aa + bs2*(adiag[i+1]+1); 4053 vi = aj + adiag[i+1]+1; 4054 nz = adiag[i] - adiag[i+1]-1; 4055 idt = bs*i; 4056 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4057 4058 for(k=0;k<nz;k++){ 4059 idx = bs*vi[k]; 4060 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4061 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4062 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4063 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4064 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4065 4066 v += bs2; 4067 } 4068 /* x = inv_diagonal*x */ 4069 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4070 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4071 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4072 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4073 4074 } 4075 4076 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4077 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4078 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4079 PetscFunctionReturn(0); 4080 } 4081 4082 #undef __FUNCT__ 4083 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4084 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4085 { 4086 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4087 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4088 PetscErrorCode ierr; 4089 PetscInt *diag = a->diag; 4090 MatScalar *aa=a->a; 4091 PetscScalar *x,*b; 4092 4093 PetscFunctionBegin; 4094 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4095 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4096 4097 { 4098 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4099 MatScalar *v,*t=(MatScalar *)x; 4100 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 4101 4102 /* forward solve the lower triangular */ 4103 idx = 0; 4104 t[0] = (MatScalar)b[0]; 4105 t[1] = (MatScalar)b[1]; 4106 t[2] = (MatScalar)b[2]; 4107 t[3] = (MatScalar)b[3]; 4108 for (i=1; i<n; i++) { 4109 v = aa + 16*ai[i]; 4110 vi = aj + ai[i]; 4111 nz = diag[i] - ai[i]; 4112 idx += 4; 4113 s1 = (MatScalar)b[idx]; 4114 s2 = (MatScalar)b[1+idx]; 4115 s3 = (MatScalar)b[2+idx]; 4116 s4 = (MatScalar)b[3+idx]; 4117 while (nz--) { 4118 jdx = 4*(*vi++); 4119 x1 = t[jdx]; 4120 x2 = t[1+jdx]; 4121 x3 = t[2+jdx]; 4122 x4 = t[3+jdx]; 4123 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4124 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4125 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4126 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4127 v += 16; 4128 } 4129 t[idx] = s1; 4130 t[1+idx] = s2; 4131 t[2+idx] = s3; 4132 t[3+idx] = s4; 4133 } 4134 /* backward solve the upper triangular */ 4135 idt = 4*(n-1); 4136 for (i=n-1; i>=0; i--){ 4137 ai16 = 16*diag[i]; 4138 v = aa + ai16 + 16; 4139 vi = aj + diag[i] + 1; 4140 nz = ai[i+1] - diag[i] - 1; 4141 s1 = t[idt]; 4142 s2 = t[1+idt]; 4143 s3 = t[2+idt]; 4144 s4 = t[3+idt]; 4145 while (nz--) { 4146 idx = 4*(*vi++); 4147 x1 = (MatScalar)x[idx]; 4148 x2 = (MatScalar)x[1+idx]; 4149 x3 = (MatScalar)x[2+idx]; 4150 x4 = (MatScalar)x[3+idx]; 4151 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4152 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4153 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4154 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4155 v += 16; 4156 } 4157 v = aa + ai16; 4158 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4159 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4160 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4161 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4162 idt -= 4; 4163 } 4164 } 4165 4166 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4168 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4169 PetscFunctionReturn(0); 4170 } 4171 4172 #if defined (PETSC_HAVE_SSE) 4173 4174 #include PETSC_HAVE_SSE 4175 #undef __FUNCT__ 4176 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4177 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4178 { 4179 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4180 unsigned short *aj=(unsigned short *)a->j; 4181 PetscErrorCode ierr; 4182 int *ai=a->i,n=a->mbs,*diag = a->diag; 4183 MatScalar *aa=a->a; 4184 PetscScalar *x,*b; 4185 4186 PetscFunctionBegin; 4187 SSE_SCOPE_BEGIN; 4188 /* 4189 Note: This code currently uses demotion of double 4190 to float when performing the mixed-mode computation. 4191 This may not be numerically reasonable for all applications. 4192 */ 4193 PREFETCH_NTA(aa+16*ai[1]); 4194 4195 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4196 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4197 { 4198 /* x will first be computed in single precision then promoted inplace to double */ 4199 MatScalar *v,*t=(MatScalar *)x; 4200 int nz,i,idt,ai16; 4201 unsigned int jdx,idx; 4202 unsigned short *vi; 4203 /* Forward solve the lower triangular factor. */ 4204 4205 /* First block is the identity. */ 4206 idx = 0; 4207 CONVERT_DOUBLE4_FLOAT4(t,b); 4208 v = aa + 16*((unsigned int)ai[1]); 4209 4210 for (i=1; i<n;) { 4211 PREFETCH_NTA(&v[8]); 4212 vi = aj + ai[i]; 4213 nz = diag[i] - ai[i]; 4214 idx += 4; 4215 4216 /* Demote RHS from double to float. */ 4217 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4218 LOAD_PS(&t[idx],XMM7); 4219 4220 while (nz--) { 4221 PREFETCH_NTA(&v[16]); 4222 jdx = 4*((unsigned int)(*vi++)); 4223 4224 /* 4x4 Matrix-Vector product with negative accumulation: */ 4225 SSE_INLINE_BEGIN_2(&t[jdx],v) 4226 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4227 4228 /* First Column */ 4229 SSE_COPY_PS(XMM0,XMM6) 4230 SSE_SHUFFLE(XMM0,XMM0,0x00) 4231 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4232 SSE_SUB_PS(XMM7,XMM0) 4233 4234 /* Second Column */ 4235 SSE_COPY_PS(XMM1,XMM6) 4236 SSE_SHUFFLE(XMM1,XMM1,0x55) 4237 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4238 SSE_SUB_PS(XMM7,XMM1) 4239 4240 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4241 4242 /* Third Column */ 4243 SSE_COPY_PS(XMM2,XMM6) 4244 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4245 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4246 SSE_SUB_PS(XMM7,XMM2) 4247 4248 /* Fourth Column */ 4249 SSE_COPY_PS(XMM3,XMM6) 4250 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4251 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4252 SSE_SUB_PS(XMM7,XMM3) 4253 SSE_INLINE_END_2 4254 4255 v += 16; 4256 } 4257 v = aa + 16*ai[++i]; 4258 PREFETCH_NTA(v); 4259 STORE_PS(&t[idx],XMM7); 4260 } 4261 4262 /* Backward solve the upper triangular factor.*/ 4263 4264 idt = 4*(n-1); 4265 ai16 = 16*diag[n-1]; 4266 v = aa + ai16 + 16; 4267 for (i=n-1; i>=0;){ 4268 PREFETCH_NTA(&v[8]); 4269 vi = aj + diag[i] + 1; 4270 nz = ai[i+1] - diag[i] - 1; 4271 4272 LOAD_PS(&t[idt],XMM7); 4273 4274 while (nz--) { 4275 PREFETCH_NTA(&v[16]); 4276 idx = 4*((unsigned int)(*vi++)); 4277 4278 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4279 SSE_INLINE_BEGIN_2(&t[idx],v) 4280 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4281 4282 /* First Column */ 4283 SSE_COPY_PS(XMM0,XMM6) 4284 SSE_SHUFFLE(XMM0,XMM0,0x00) 4285 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4286 SSE_SUB_PS(XMM7,XMM0) 4287 4288 /* Second Column */ 4289 SSE_COPY_PS(XMM1,XMM6) 4290 SSE_SHUFFLE(XMM1,XMM1,0x55) 4291 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4292 SSE_SUB_PS(XMM7,XMM1) 4293 4294 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4295 4296 /* Third Column */ 4297 SSE_COPY_PS(XMM2,XMM6) 4298 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4299 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4300 SSE_SUB_PS(XMM7,XMM2) 4301 4302 /* Fourth Column */ 4303 SSE_COPY_PS(XMM3,XMM6) 4304 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4305 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4306 SSE_SUB_PS(XMM7,XMM3) 4307 SSE_INLINE_END_2 4308 v += 16; 4309 } 4310 v = aa + ai16; 4311 ai16 = 16*diag[--i]; 4312 PREFETCH_NTA(aa+ai16+16); 4313 /* 4314 Scale the result by the diagonal 4x4 block, 4315 which was inverted as part of the factorization 4316 */ 4317 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4318 /* First Column */ 4319 SSE_COPY_PS(XMM0,XMM7) 4320 SSE_SHUFFLE(XMM0,XMM0,0x00) 4321 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4322 4323 /* Second Column */ 4324 SSE_COPY_PS(XMM1,XMM7) 4325 SSE_SHUFFLE(XMM1,XMM1,0x55) 4326 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4327 SSE_ADD_PS(XMM0,XMM1) 4328 4329 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4330 4331 /* Third Column */ 4332 SSE_COPY_PS(XMM2,XMM7) 4333 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4334 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4335 SSE_ADD_PS(XMM0,XMM2) 4336 4337 /* Fourth Column */ 4338 SSE_COPY_PS(XMM3,XMM7) 4339 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4340 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4341 SSE_ADD_PS(XMM0,XMM3) 4342 4343 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4344 SSE_INLINE_END_3 4345 4346 v = aa + ai16 + 16; 4347 idt -= 4; 4348 } 4349 4350 /* Convert t from single precision back to double precision (inplace)*/ 4351 idt = 4*(n-1); 4352 for (i=n-1;i>=0;i--) { 4353 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4354 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4355 PetscScalar *xtemp=&x[idt]; 4356 MatScalar *ttemp=&t[idt]; 4357 xtemp[3] = (PetscScalar)ttemp[3]; 4358 xtemp[2] = (PetscScalar)ttemp[2]; 4359 xtemp[1] = (PetscScalar)ttemp[1]; 4360 xtemp[0] = (PetscScalar)ttemp[0]; 4361 idt -= 4; 4362 } 4363 4364 } /* End of artificial scope. */ 4365 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4366 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4367 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4368 SSE_SCOPE_END; 4369 PetscFunctionReturn(0); 4370 } 4371 4372 #undef __FUNCT__ 4373 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4374 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4375 { 4376 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4377 int *aj=a->j; 4378 PetscErrorCode ierr; 4379 int *ai=a->i,n=a->mbs,*diag = a->diag; 4380 MatScalar *aa=a->a; 4381 PetscScalar *x,*b; 4382 4383 PetscFunctionBegin; 4384 SSE_SCOPE_BEGIN; 4385 /* 4386 Note: This code currently uses demotion of double 4387 to float when performing the mixed-mode computation. 4388 This may not be numerically reasonable for all applications. 4389 */ 4390 PREFETCH_NTA(aa+16*ai[1]); 4391 4392 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4394 { 4395 /* x will first be computed in single precision then promoted inplace to double */ 4396 MatScalar *v,*t=(MatScalar *)x; 4397 int nz,i,idt,ai16; 4398 int jdx,idx; 4399 int *vi; 4400 /* Forward solve the lower triangular factor. */ 4401 4402 /* First block is the identity. */ 4403 idx = 0; 4404 CONVERT_DOUBLE4_FLOAT4(t,b); 4405 v = aa + 16*ai[1]; 4406 4407 for (i=1; i<n;) { 4408 PREFETCH_NTA(&v[8]); 4409 vi = aj + ai[i]; 4410 nz = diag[i] - ai[i]; 4411 idx += 4; 4412 4413 /* Demote RHS from double to float. */ 4414 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4415 LOAD_PS(&t[idx],XMM7); 4416 4417 while (nz--) { 4418 PREFETCH_NTA(&v[16]); 4419 jdx = 4*(*vi++); 4420 /* jdx = *vi++; */ 4421 4422 /* 4x4 Matrix-Vector product with negative accumulation: */ 4423 SSE_INLINE_BEGIN_2(&t[jdx],v) 4424 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4425 4426 /* First Column */ 4427 SSE_COPY_PS(XMM0,XMM6) 4428 SSE_SHUFFLE(XMM0,XMM0,0x00) 4429 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4430 SSE_SUB_PS(XMM7,XMM0) 4431 4432 /* Second Column */ 4433 SSE_COPY_PS(XMM1,XMM6) 4434 SSE_SHUFFLE(XMM1,XMM1,0x55) 4435 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4436 SSE_SUB_PS(XMM7,XMM1) 4437 4438 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4439 4440 /* Third Column */ 4441 SSE_COPY_PS(XMM2,XMM6) 4442 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4443 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4444 SSE_SUB_PS(XMM7,XMM2) 4445 4446 /* Fourth Column */ 4447 SSE_COPY_PS(XMM3,XMM6) 4448 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4449 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4450 SSE_SUB_PS(XMM7,XMM3) 4451 SSE_INLINE_END_2 4452 4453 v += 16; 4454 } 4455 v = aa + 16*ai[++i]; 4456 PREFETCH_NTA(v); 4457 STORE_PS(&t[idx],XMM7); 4458 } 4459 4460 /* Backward solve the upper triangular factor.*/ 4461 4462 idt = 4*(n-1); 4463 ai16 = 16*diag[n-1]; 4464 v = aa + ai16 + 16; 4465 for (i=n-1; i>=0;){ 4466 PREFETCH_NTA(&v[8]); 4467 vi = aj + diag[i] + 1; 4468 nz = ai[i+1] - diag[i] - 1; 4469 4470 LOAD_PS(&t[idt],XMM7); 4471 4472 while (nz--) { 4473 PREFETCH_NTA(&v[16]); 4474 idx = 4*(*vi++); 4475 /* idx = *vi++; */ 4476 4477 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4478 SSE_INLINE_BEGIN_2(&t[idx],v) 4479 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4480 4481 /* First Column */ 4482 SSE_COPY_PS(XMM0,XMM6) 4483 SSE_SHUFFLE(XMM0,XMM0,0x00) 4484 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4485 SSE_SUB_PS(XMM7,XMM0) 4486 4487 /* Second Column */ 4488 SSE_COPY_PS(XMM1,XMM6) 4489 SSE_SHUFFLE(XMM1,XMM1,0x55) 4490 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4491 SSE_SUB_PS(XMM7,XMM1) 4492 4493 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4494 4495 /* Third Column */ 4496 SSE_COPY_PS(XMM2,XMM6) 4497 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4498 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4499 SSE_SUB_PS(XMM7,XMM2) 4500 4501 /* Fourth Column */ 4502 SSE_COPY_PS(XMM3,XMM6) 4503 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4504 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4505 SSE_SUB_PS(XMM7,XMM3) 4506 SSE_INLINE_END_2 4507 v += 16; 4508 } 4509 v = aa + ai16; 4510 ai16 = 16*diag[--i]; 4511 PREFETCH_NTA(aa+ai16+16); 4512 /* 4513 Scale the result by the diagonal 4x4 block, 4514 which was inverted as part of the factorization 4515 */ 4516 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4517 /* First Column */ 4518 SSE_COPY_PS(XMM0,XMM7) 4519 SSE_SHUFFLE(XMM0,XMM0,0x00) 4520 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4521 4522 /* Second Column */ 4523 SSE_COPY_PS(XMM1,XMM7) 4524 SSE_SHUFFLE(XMM1,XMM1,0x55) 4525 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4526 SSE_ADD_PS(XMM0,XMM1) 4527 4528 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4529 4530 /* Third Column */ 4531 SSE_COPY_PS(XMM2,XMM7) 4532 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4533 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4534 SSE_ADD_PS(XMM0,XMM2) 4535 4536 /* Fourth Column */ 4537 SSE_COPY_PS(XMM3,XMM7) 4538 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4539 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4540 SSE_ADD_PS(XMM0,XMM3) 4541 4542 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4543 SSE_INLINE_END_3 4544 4545 v = aa + ai16 + 16; 4546 idt -= 4; 4547 } 4548 4549 /* Convert t from single precision back to double precision (inplace)*/ 4550 idt = 4*(n-1); 4551 for (i=n-1;i>=0;i--) { 4552 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4553 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4554 PetscScalar *xtemp=&x[idt]; 4555 MatScalar *ttemp=&t[idt]; 4556 xtemp[3] = (PetscScalar)ttemp[3]; 4557 xtemp[2] = (PetscScalar)ttemp[2]; 4558 xtemp[1] = (PetscScalar)ttemp[1]; 4559 xtemp[0] = (PetscScalar)ttemp[0]; 4560 idt -= 4; 4561 } 4562 4563 } /* End of artificial scope. */ 4564 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4565 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4566 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4567 SSE_SCOPE_END; 4568 PetscFunctionReturn(0); 4569 } 4570 4571 #endif 4572 4573 #undef __FUNCT__ 4574 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4575 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4576 { 4577 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4578 IS iscol=a->col,isrow=a->row; 4579 PetscErrorCode ierr; 4580 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4581 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4582 const MatScalar *aa=a->a,*v; 4583 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4584 const PetscScalar *b; 4585 4586 PetscFunctionBegin; 4587 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4588 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4589 t = a->solve_work; 4590 4591 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4592 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4593 4594 /* forward solve the lower triangular */ 4595 idx = 3*(*r++); 4596 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4597 for (i=1; i<n; i++) { 4598 v = aa + 9*ai[i]; 4599 vi = aj + ai[i]; 4600 nz = diag[i] - ai[i]; 4601 idx = 3*(*r++); 4602 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4603 while (nz--) { 4604 idx = 3*(*vi++); 4605 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4606 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4607 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4608 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4609 v += 9; 4610 } 4611 idx = 3*i; 4612 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4613 } 4614 /* backward solve the upper triangular */ 4615 for (i=n-1; i>=0; i--){ 4616 v = aa + 9*diag[i] + 9; 4617 vi = aj + diag[i] + 1; 4618 nz = ai[i+1] - diag[i] - 1; 4619 idt = 3*i; 4620 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4621 while (nz--) { 4622 idx = 3*(*vi++); 4623 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4624 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4625 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4626 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4627 v += 9; 4628 } 4629 idc = 3*(*c--); 4630 v = aa + 9*diag[i]; 4631 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4632 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4633 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4634 } 4635 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4636 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4637 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4638 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4639 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4640 PetscFunctionReturn(0); 4641 } 4642 4643 #undef __FUNCT__ 4644 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4645 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4646 { 4647 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4648 IS iscol=a->col,isrow=a->row; 4649 PetscErrorCode ierr; 4650 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4651 const PetscInt *r,*c,*rout,*cout; 4652 const MatScalar *aa=a->a,*v; 4653 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4654 const PetscScalar *b; 4655 4656 PetscFunctionBegin; 4657 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4658 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4659 t = a->solve_work; 4660 4661 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4662 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4663 4664 /* forward solve the lower triangular */ 4665 idx = 3*r[0]; 4666 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4667 for (i=1; i<n; i++) { 4668 v = aa + 9*ai[i]; 4669 vi = aj + ai[i]; 4670 nz = ai[i+1] - ai[i]; 4671 idx = 3*r[i]; 4672 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4673 for(m=0;m<nz;m++){ 4674 idx = 3*vi[m]; 4675 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4676 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4677 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4678 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4679 v += 9; 4680 } 4681 idx = 3*i; 4682 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4683 } 4684 /* backward solve the upper triangular */ 4685 for (i=n-1; i>=0; i--){ 4686 v = aa + 9*(adiag[i+1]+1); 4687 vi = aj + adiag[i+1]+1; 4688 nz = adiag[i] - adiag[i+1] - 1; 4689 idt = 3*i; 4690 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4691 for(m=0;m<nz;m++){ 4692 idx = 3*vi[m]; 4693 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4694 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4695 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4696 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4697 v += 9; 4698 } 4699 idc = 3*c[i]; 4700 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4701 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4702 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4703 } 4704 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4705 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4706 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4707 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4708 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4709 PetscFunctionReturn(0); 4710 } 4711 4712 /* 4713 Special case where the matrix was ILU(0) factored in the natural 4714 ordering. This eliminates the need for the column and row permutation. 4715 */ 4716 #undef __FUNCT__ 4717 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4718 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4719 { 4720 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4721 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4722 PetscErrorCode ierr; 4723 PetscInt *diag = a->diag; 4724 const MatScalar *aa=a->a,*v; 4725 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4726 const PetscScalar *b; 4727 PetscInt jdx,idt,idx,nz,*vi,i; 4728 4729 PetscFunctionBegin; 4730 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4731 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4732 4733 /* forward solve the lower triangular */ 4734 idx = 0; 4735 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4736 for (i=1; i<n; i++) { 4737 v = aa + 9*ai[i]; 4738 vi = aj + ai[i]; 4739 nz = diag[i] - ai[i]; 4740 idx += 3; 4741 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4742 while (nz--) { 4743 jdx = 3*(*vi++); 4744 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4745 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4746 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4747 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4748 v += 9; 4749 } 4750 x[idx] = s1; 4751 x[1+idx] = s2; 4752 x[2+idx] = s3; 4753 } 4754 /* backward solve the upper triangular */ 4755 for (i=n-1; i>=0; i--){ 4756 v = aa + 9*diag[i] + 9; 4757 vi = aj + diag[i] + 1; 4758 nz = ai[i+1] - diag[i] - 1; 4759 idt = 3*i; 4760 s1 = x[idt]; s2 = x[1+idt]; 4761 s3 = x[2+idt]; 4762 while (nz--) { 4763 idx = 3*(*vi++); 4764 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4765 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4766 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4767 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4768 v += 9; 4769 } 4770 v = aa + 9*diag[i]; 4771 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4772 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4773 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4774 } 4775 4776 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4777 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4778 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4779 PetscFunctionReturn(0); 4780 } 4781 4782 #undef __FUNCT__ 4783 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4784 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4785 { 4786 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4787 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4788 PetscErrorCode ierr; 4789 PetscInt idx,jdx,idt; 4790 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4791 const MatScalar *aa=a->a,*v; 4792 PetscScalar *x; 4793 const PetscScalar *b; 4794 PetscScalar s1,s2,s3,x1,x2,x3; 4795 4796 PetscFunctionBegin; 4797 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4798 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4799 /* forward solve the lower triangular */ 4800 idx = 0; 4801 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4802 for (i=1; i<n; i++) { 4803 v = aa + bs2*ai[i]; 4804 vi = aj + ai[i]; 4805 nz = ai[i+1] - ai[i]; 4806 idx = bs*i; 4807 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4808 for(k=0;k<nz;k++){ 4809 jdx = bs*vi[k]; 4810 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4811 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4812 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4813 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4814 4815 v += bs2; 4816 } 4817 4818 x[idx] = s1; 4819 x[1+idx] = s2; 4820 x[2+idx] = s3; 4821 } 4822 4823 /* backward solve the upper triangular */ 4824 for (i=n-1; i>=0; i--){ 4825 v = aa + bs2*(adiag[i+1]+1); 4826 vi = aj + adiag[i+1]+1; 4827 nz = adiag[i] - adiag[i+1]-1; 4828 idt = bs*i; 4829 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4830 4831 for(k=0;k<nz;k++){ 4832 idx = bs*vi[k]; 4833 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4834 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4835 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4836 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4837 4838 v += bs2; 4839 } 4840 /* x = inv_diagonal*x */ 4841 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4842 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4843 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4844 4845 } 4846 4847 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4848 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4849 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4850 PetscFunctionReturn(0); 4851 } 4852 4853 #undef __FUNCT__ 4854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 4855 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 4856 { 4857 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4858 IS iscol=a->col,isrow=a->row; 4859 PetscErrorCode ierr; 4860 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4861 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4862 const MatScalar *aa=a->a,*v; 4863 PetscScalar *x,s1,s2,x1,x2,*t; 4864 const PetscScalar *b; 4865 4866 PetscFunctionBegin; 4867 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4868 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4869 t = a->solve_work; 4870 4871 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4872 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4873 4874 /* forward solve the lower triangular */ 4875 idx = 2*(*r++); 4876 t[0] = b[idx]; t[1] = b[1+idx]; 4877 for (i=1; i<n; i++) { 4878 v = aa + 4*ai[i]; 4879 vi = aj + ai[i]; 4880 nz = diag[i] - ai[i]; 4881 idx = 2*(*r++); 4882 s1 = b[idx]; s2 = b[1+idx]; 4883 while (nz--) { 4884 idx = 2*(*vi++); 4885 x1 = t[idx]; x2 = t[1+idx]; 4886 s1 -= v[0]*x1 + v[2]*x2; 4887 s2 -= v[1]*x1 + v[3]*x2; 4888 v += 4; 4889 } 4890 idx = 2*i; 4891 t[idx] = s1; t[1+idx] = s2; 4892 } 4893 /* backward solve the upper triangular */ 4894 for (i=n-1; i>=0; i--){ 4895 v = aa + 4*diag[i] + 4; 4896 vi = aj + diag[i] + 1; 4897 nz = ai[i+1] - diag[i] - 1; 4898 idt = 2*i; 4899 s1 = t[idt]; s2 = t[1+idt]; 4900 while (nz--) { 4901 idx = 2*(*vi++); 4902 x1 = t[idx]; x2 = t[1+idx]; 4903 s1 -= v[0]*x1 + v[2]*x2; 4904 s2 -= v[1]*x1 + v[3]*x2; 4905 v += 4; 4906 } 4907 idc = 2*(*c--); 4908 v = aa + 4*diag[i]; 4909 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4910 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4911 } 4912 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4913 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4914 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4915 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4916 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4917 PetscFunctionReturn(0); 4918 } 4919 4920 #undef __FUNCT__ 4921 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4922 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4923 { 4924 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4925 IS iscol=a->col,isrow=a->row; 4926 PetscErrorCode ierr; 4927 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4928 const PetscInt *r,*c,*rout,*cout; 4929 const MatScalar *aa=a->a,*v; 4930 PetscScalar *x,s1,s2,x1,x2,*t; 4931 const PetscScalar *b; 4932 4933 PetscFunctionBegin; 4934 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4935 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4936 t = a->solve_work; 4937 4938 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4939 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4940 4941 /* forward solve the lower triangular */ 4942 idx = 2*r[0]; 4943 t[0] = b[idx]; t[1] = b[1+idx]; 4944 for (i=1; i<n; i++) { 4945 v = aa + 4*ai[i]; 4946 vi = aj + ai[i]; 4947 nz = ai[i+1] - ai[i]; 4948 idx = 2*r[i]; 4949 s1 = b[idx]; s2 = b[1+idx]; 4950 for(m=0;m<nz;m++){ 4951 jdx = 2*vi[m]; 4952 x1 = t[jdx]; x2 = t[1+jdx]; 4953 s1 -= v[0]*x1 + v[2]*x2; 4954 s2 -= v[1]*x1 + v[3]*x2; 4955 v += 4; 4956 } 4957 idx = 2*i; 4958 t[idx] = s1; t[1+idx] = s2; 4959 } 4960 /* backward solve the upper triangular */ 4961 for (i=n-1; i>=0; i--){ 4962 v = aa + 4*(adiag[i+1]+1); 4963 vi = aj + adiag[i+1]+1; 4964 nz = adiag[i] - adiag[i+1] - 1; 4965 idt = 2*i; 4966 s1 = t[idt]; s2 = t[1+idt]; 4967 for(m=0;m<nz;m++){ 4968 idx = 2*vi[m]; 4969 x1 = t[idx]; x2 = t[1+idx]; 4970 s1 -= v[0]*x1 + v[2]*x2; 4971 s2 -= v[1]*x1 + v[3]*x2; 4972 v += 4; 4973 } 4974 idc = 2*c[i]; 4975 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4976 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4977 } 4978 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4979 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4980 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4981 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4982 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4983 PetscFunctionReturn(0); 4984 } 4985 4986 /* 4987 Special case where the matrix was ILU(0) factored in the natural 4988 ordering. This eliminates the need for the column and row permutation. 4989 */ 4990 #undef __FUNCT__ 4991 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 4992 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4993 { 4994 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4995 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4996 PetscErrorCode ierr; 4997 PetscInt *diag = a->diag; 4998 const MatScalar *aa=a->a,*v; 4999 PetscScalar *x,s1,s2,x1,x2; 5000 const PetscScalar *b; 5001 PetscInt jdx,idt,idx,nz,*vi,i; 5002 5003 PetscFunctionBegin; 5004 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5005 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5006 5007 /* forward solve the lower triangular */ 5008 idx = 0; 5009 x[0] = b[0]; x[1] = b[1]; 5010 for (i=1; i<n; i++) { 5011 v = aa + 4*ai[i]; 5012 vi = aj + ai[i]; 5013 nz = diag[i] - ai[i]; 5014 idx += 2; 5015 s1 = b[idx];s2 = b[1+idx]; 5016 while (nz--) { 5017 jdx = 2*(*vi++); 5018 x1 = x[jdx];x2 = x[1+jdx]; 5019 s1 -= v[0]*x1 + v[2]*x2; 5020 s2 -= v[1]*x1 + v[3]*x2; 5021 v += 4; 5022 } 5023 x[idx] = s1; 5024 x[1+idx] = s2; 5025 } 5026 /* backward solve the upper triangular */ 5027 for (i=n-1; i>=0; i--){ 5028 v = aa + 4*diag[i] + 4; 5029 vi = aj + diag[i] + 1; 5030 nz = ai[i+1] - diag[i] - 1; 5031 idt = 2*i; 5032 s1 = x[idt]; s2 = x[1+idt]; 5033 while (nz--) { 5034 idx = 2*(*vi++); 5035 x1 = x[idx]; x2 = x[1+idx]; 5036 s1 -= v[0]*x1 + v[2]*x2; 5037 s2 -= v[1]*x1 + v[3]*x2; 5038 v += 4; 5039 } 5040 v = aa + 4*diag[i]; 5041 x[idt] = v[0]*s1 + v[2]*s2; 5042 x[1+idt] = v[1]*s1 + v[3]*s2; 5043 } 5044 5045 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5046 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5047 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5048 PetscFunctionReturn(0); 5049 } 5050 5051 #undef __FUNCT__ 5052 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5053 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5054 { 5055 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5056 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 5057 PetscErrorCode ierr; 5058 PetscInt jdx; 5059 const MatScalar *aa=a->a,*v; 5060 PetscScalar *x,s1,s2,x1,x2; 5061 const PetscScalar *b; 5062 5063 PetscFunctionBegin; 5064 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5065 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5066 /* forward solve the lower triangular */ 5067 idx = 0; 5068 x[0] = b[idx]; x[1] = b[1+idx]; 5069 for (i=1; i<n; i++) { 5070 v = aa + 4*ai[i]; 5071 vi = aj + ai[i]; 5072 nz = ai[i+1] - ai[i]; 5073 idx = 2*i; 5074 s1 = b[idx];s2 = b[1+idx]; 5075 for(k=0;k<nz;k++){ 5076 jdx = 2*vi[k]; 5077 x1 = x[jdx];x2 = x[1+jdx]; 5078 s1 -= v[0]*x1 + v[2]*x2; 5079 s2 -= v[1]*x1 + v[3]*x2; 5080 v += 4; 5081 } 5082 x[idx] = s1; 5083 x[1+idx] = s2; 5084 } 5085 5086 /* backward solve the upper triangular */ 5087 for (i=n-1; i>=0; i--){ 5088 v = aa + 4*(adiag[i+1]+1); 5089 vi = aj + adiag[i+1]+1; 5090 nz = adiag[i] - adiag[i+1]-1; 5091 idt = 2*i; 5092 s1 = x[idt]; s2 = x[1+idt]; 5093 for(k=0;k<nz;k++){ 5094 idx = 2*vi[k]; 5095 x1 = x[idx]; x2 = x[1+idx]; 5096 s1 -= v[0]*x1 + v[2]*x2; 5097 s2 -= v[1]*x1 + v[3]*x2; 5098 v += 4; 5099 } 5100 /* x = inv_diagonal*x */ 5101 x[idt] = v[0]*s1 + v[2]*s2; 5102 x[1+idt] = v[1]*s1 + v[3]*s2; 5103 } 5104 5105 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5106 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5107 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5108 PetscFunctionReturn(0); 5109 } 5110 5111 #undef __FUNCT__ 5112 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5113 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5114 { 5115 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5116 IS iscol=a->col,isrow=a->row; 5117 PetscErrorCode ierr; 5118 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 5119 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5120 MatScalar *aa=a->a,*v; 5121 PetscScalar *x,*b,s1,*t; 5122 5123 PetscFunctionBegin; 5124 if (!n) PetscFunctionReturn(0); 5125 5126 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5128 t = a->solve_work; 5129 5130 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5131 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5132 5133 /* forward solve the lower triangular */ 5134 t[0] = b[*r++]; 5135 for (i=1; i<n; i++) { 5136 v = aa + ai[i]; 5137 vi = aj + ai[i]; 5138 nz = diag[i] - ai[i]; 5139 s1 = b[*r++]; 5140 while (nz--) { 5141 s1 -= (*v++)*t[*vi++]; 5142 } 5143 t[i] = s1; 5144 } 5145 /* backward solve the upper triangular */ 5146 for (i=n-1; i>=0; i--){ 5147 v = aa + diag[i] + 1; 5148 vi = aj + diag[i] + 1; 5149 nz = ai[i+1] - diag[i] - 1; 5150 s1 = t[i]; 5151 while (nz--) { 5152 s1 -= (*v++)*t[*vi++]; 5153 } 5154 x[*c--] = t[i] = aa[diag[i]]*s1; 5155 } 5156 5157 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5158 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5159 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5160 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5161 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5162 PetscFunctionReturn(0); 5163 } 5164 /* 5165 Special case where the matrix was ILU(0) factored in the natural 5166 ordering. This eliminates the need for the column and row permutation. 5167 */ 5168 #undef __FUNCT__ 5169 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5170 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5171 { 5172 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5173 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5174 PetscErrorCode ierr; 5175 PetscInt *diag = a->diag; 5176 MatScalar *aa=a->a; 5177 PetscScalar *x,*b; 5178 PetscScalar s1,x1; 5179 MatScalar *v; 5180 PetscInt jdx,idt,idx,nz,*vi,i; 5181 5182 PetscFunctionBegin; 5183 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5184 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5185 5186 /* forward solve the lower triangular */ 5187 idx = 0; 5188 x[0] = b[0]; 5189 for (i=1; i<n; i++) { 5190 v = aa + ai[i]; 5191 vi = aj + ai[i]; 5192 nz = diag[i] - ai[i]; 5193 idx += 1; 5194 s1 = b[idx]; 5195 while (nz--) { 5196 jdx = *vi++; 5197 x1 = x[jdx]; 5198 s1 -= v[0]*x1; 5199 v += 1; 5200 } 5201 x[idx] = s1; 5202 } 5203 /* backward solve the upper triangular */ 5204 for (i=n-1; i>=0; i--){ 5205 v = aa + diag[i] + 1; 5206 vi = aj + diag[i] + 1; 5207 nz = ai[i+1] - diag[i] - 1; 5208 idt = i; 5209 s1 = x[idt]; 5210 while (nz--) { 5211 idx = *vi++; 5212 x1 = x[idx]; 5213 s1 -= v[0]*x1; 5214 v += 1; 5215 } 5216 v = aa + diag[i]; 5217 x[idt] = v[0]*s1; 5218 } 5219 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5220 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5221 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5222 PetscFunctionReturn(0); 5223 } 5224 5225 /* ----------------------------------------------------------------*/ 5226 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5227 5228 /* bs = 15 for PFLOTRAN */ 5229 #undef __FUNCT__ 5230 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5231 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5232 { 5233 Mat C=B; 5234 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5235 PetscErrorCode ierr; 5236 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5237 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5238 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5239 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5240 PetscReal shift = info->shiftinblocks; 5241 5242 PetscFunctionBegin; 5243 5244 /* generate work space needed by the factorization */ 5245 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5246 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5247 5248 for (i=0; i<n; i++){ 5249 /* zero rtmp */ 5250 /* L part */ 5251 nz = bi[i+1] - bi[i]; 5252 bjtmp = bj + bi[i]; 5253 for (j=0; j<nz; j++){ 5254 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5255 } 5256 5257 /* U part */ 5258 nz = bdiag[i] - bdiag[i+1]; 5259 bjtmp = bj + bdiag[i+1]+1; 5260 for (j=0; j<nz; j++){ 5261 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5262 } 5263 5264 /* load in initial (unfactored row) */ 5265 nz = ai[i+1] - ai[i]; 5266 ajtmp = aj + ai[i]; 5267 v = aa + bs2*ai[i]; 5268 for (j=0; j<nz; j++) { 5269 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5270 } 5271 5272 /* elimination */ 5273 bjtmp = bj + bi[i]; 5274 nzL = bi[i+1] - bi[i]; 5275 for(k=0;k < nzL;k++) { 5276 row = bjtmp[k]; 5277 pc = rtmp + bs2*row; 5278 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5279 if (flg) { 5280 pv = b->a + bs2*bdiag[row]; 5281 ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr); 5282 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5283 pv = b->a + bs2*(bdiag[row+1]+1); 5284 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5285 for (j=0; j<nz; j++) { 5286 v = rtmp + bs2*pj[j]; 5287 ierr = Kernel_A_gets_A_minus_B_times_C_15(v,pc,pv);CHKERRQ(ierr); 5288 pv += bs2; 5289 } 5290 ierr = PetscLogFlops(2*bs2*bs*nz+2*bs2*bs-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5291 } 5292 } 5293 5294 /* finished row so stick it into b->a */ 5295 /* L part */ 5296 pv = b->a + bs2*bi[i] ; 5297 pj = b->j + bi[i] ; 5298 nz = bi[i+1] - bi[i]; 5299 for (j=0; j<nz; j++) { 5300 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5301 } 5302 5303 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5304 pv = b->a + bs2*bdiag[i]; 5305 pj = b->j + bdiag[i]; 5306 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5307 ierr = Kernel_A_gets_inverse_A_15(pv,shift);CHKERRQ(ierr); 5308 5309 /* U part */ 5310 pv = b->a + bs2*(bdiag[i+1]+1); 5311 pj = b->j + bdiag[i+1]+1; 5312 nz = bdiag[i] - bdiag[i+1] - 1; 5313 for (j=0; j<nz; j++){ 5314 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5315 } 5316 } 5317 5318 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5319 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering; 5320 C->ops->solvetranspose = 0; 5321 C->assembled = PETSC_TRUE; 5322 ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5323 PetscFunctionReturn(0); 5324 } 5325 5326 #undef __FUNCT__ 5327 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5328 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5329 { 5330 Mat C=B; 5331 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5332 IS isrow = b->row,isicol = b->icol; 5333 PetscErrorCode ierr; 5334 const PetscInt *r,*ic,*ics; 5335 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5336 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5337 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5338 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5339 MatScalar *v_work; 5340 PetscTruth col_identity,row_identity,both_identity; 5341 5342 PetscFunctionBegin; 5343 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5344 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5345 5346 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5347 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5348 ics = ic; 5349 5350 /* generate work space needed by dense LU factorization */ 5351 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5352 5353 for (i=0; i<n; i++){ 5354 /* zero rtmp */ 5355 /* L part */ 5356 nz = bi[i+1] - bi[i]; 5357 bjtmp = bj + bi[i]; 5358 for (j=0; j<nz; j++){ 5359 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5360 } 5361 5362 /* U part */ 5363 nz = bdiag[i] - bdiag[i+1]; 5364 bjtmp = bj + bdiag[i+1]+1; 5365 for (j=0; j<nz; j++){ 5366 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5367 } 5368 5369 /* load in initial (unfactored row) */ 5370 nz = ai[r[i]+1] - ai[r[i]]; 5371 ajtmp = aj + ai[r[i]]; 5372 v = aa + bs2*ai[r[i]]; 5373 for (j=0; j<nz; j++) { 5374 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5375 } 5376 5377 /* elimination */ 5378 bjtmp = bj + bi[i]; 5379 nzL = bi[i+1] - bi[i]; 5380 for(k=0;k < nzL;k++) { 5381 row = bjtmp[k]; 5382 pc = rtmp + bs2*row; 5383 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5384 if (flg) { 5385 pv = b->a + bs2*bdiag[row]; 5386 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5387 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5388 pv = b->a + bs2*(bdiag[row+1]+1); 5389 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5390 for (j=0; j<nz; j++) { 5391 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5392 } 5393 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5394 } 5395 } 5396 5397 /* finished row so stick it into b->a */ 5398 /* L part */ 5399 pv = b->a + bs2*bi[i] ; 5400 pj = b->j + bi[i] ; 5401 nz = bi[i+1] - bi[i]; 5402 for (j=0; j<nz; j++) { 5403 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5404 } 5405 5406 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5407 pv = b->a + bs2*bdiag[i]; 5408 pj = b->j + bdiag[i]; 5409 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5410 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5411 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5412 5413 /* U part */ 5414 pv = b->a + bs2*(bdiag[i+1]+1); 5415 pj = b->j + bdiag[i+1]+1; 5416 nz = bdiag[i] - bdiag[i+1] - 1; 5417 for (j=0; j<nz; j++){ 5418 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5419 } 5420 } 5421 5422 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5423 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5424 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5425 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5426 5427 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5428 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5429 both_identity = (PetscTruth) (row_identity && col_identity); 5430 if (both_identity){ 5431 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5432 } else { 5433 C->ops->solve = MatSolve_SeqBAIJ_N; 5434 } 5435 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5436 5437 C->assembled = PETSC_TRUE; 5438 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5439 PetscFunctionReturn(0); 5440 } 5441 5442 /* 5443 ilu(0) with natural ordering under new data structure. 5444 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5445 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 5446 */ 5447 5448 #undef __FUNCT__ 5449 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5450 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5451 { 5452 5453 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5454 PetscErrorCode ierr; 5455 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5456 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5457 5458 PetscFunctionBegin; 5459 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5460 b = (Mat_SeqBAIJ*)(fact)->data; 5461 5462 /* allocate matrix arrays for new data structure */ 5463 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5464 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5465 b->singlemalloc = PETSC_TRUE; 5466 if (!b->diag){ 5467 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5468 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5469 } 5470 bdiag = b->diag; 5471 5472 if (n > 0) { 5473 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5474 } 5475 5476 /* set bi and bj with new data structure */ 5477 bi = b->i; 5478 bj = b->j; 5479 5480 /* L part */ 5481 bi[0] = 0; 5482 for (i=0; i<n; i++){ 5483 nz = adiag[i] - ai[i]; 5484 bi[i+1] = bi[i] + nz; 5485 aj = a->j + ai[i]; 5486 for (j=0; j<nz; j++){ 5487 *bj = aj[j]; bj++; 5488 } 5489 } 5490 5491 /* U part */ 5492 bi_temp = bi[n]; 5493 bdiag[n] = bi[n]-1; 5494 for (i=n-1; i>=0; i--){ 5495 nz = ai[i+1] - adiag[i] - 1; 5496 bi_temp = bi_temp + nz + 1; 5497 aj = a->j + adiag[i] + 1; 5498 for (j=0; j<nz; j++){ 5499 *bj = aj[j]; bj++; 5500 } 5501 /* diag[i] */ 5502 *bj = i; bj++; 5503 bdiag[i] = bi_temp - 1; 5504 } 5505 PetscFunctionReturn(0); 5506 } 5507 5508 #undef __FUNCT__ 5509 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5510 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5511 { 5512 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5513 IS isicol; 5514 PetscErrorCode ierr; 5515 const PetscInt *r,*ic; 5516 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5517 PetscInt *bi,*cols,nnz,*cols_lvl; 5518 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5519 PetscInt i,levels,diagonal_fill; 5520 PetscTruth col_identity,row_identity,both_identity; 5521 PetscReal f; 5522 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5523 PetscBT lnkbt; 5524 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5525 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5526 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5527 PetscTruth missing; 5528 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5529 PetscTruth olddatastruct = PETSC_FALSE; 5530 5531 PetscFunctionBegin; 5532 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr); 5533 if (olddatastruct){ 5534 ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5535 PetscFunctionReturn(0); 5536 } 5537 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5538 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5539 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5540 5541 f = info->fill; 5542 levels = (PetscInt)info->levels; 5543 diagonal_fill = (PetscInt)info->diagonal_fill; 5544 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5545 5546 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5547 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5548 both_identity = (PetscTruth) (row_identity && col_identity); 5549 5550 if (!levels && both_identity) { 5551 /* special case: ilu(0) with natural ordering */ 5552 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5553 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5554 5555 fact->factor = MAT_FACTOR_ILU; 5556 (fact)->info.factor_mallocs = 0; 5557 (fact)->info.fill_ratio_given = info->fill; 5558 (fact)->info.fill_ratio_needed = 1.0; 5559 b = (Mat_SeqBAIJ*)(fact)->data; 5560 b->row = isrow; 5561 b->col = iscol; 5562 b->icol = isicol; 5563 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5564 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5565 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5566 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5567 PetscFunctionReturn(0); 5568 } 5569 5570 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5571 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5572 5573 /* get new row pointers */ 5574 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5575 bi[0] = 0; 5576 /* bdiag is location of diagonal in factor */ 5577 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5578 bdiag[0] = 0; 5579 5580 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5581 5582 /* create a linked list for storing column indices of the active row */ 5583 nlnk = n + 1; 5584 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5585 5586 /* initial FreeSpace size is f*(ai[n]+1) */ 5587 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5588 current_space = free_space; 5589 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5590 current_space_lvl = free_space_lvl; 5591 5592 for (i=0; i<n; i++) { 5593 nzi = 0; 5594 /* copy current row into linked list */ 5595 nnz = ai[r[i]+1] - ai[r[i]]; 5596 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5597 cols = aj + ai[r[i]]; 5598 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5599 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5600 nzi += nlnk; 5601 5602 /* make sure diagonal entry is included */ 5603 if (diagonal_fill && lnk[i] == -1) { 5604 fm = n; 5605 while (lnk[fm] < i) fm = lnk[fm]; 5606 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5607 lnk[fm] = i; 5608 lnk_lvl[i] = 0; 5609 nzi++; dcount++; 5610 } 5611 5612 /* add pivot rows into the active row */ 5613 nzbd = 0; 5614 prow = lnk[n]; 5615 while (prow < i) { 5616 nnz = bdiag[prow]; 5617 cols = bj_ptr[prow] + nnz + 1; 5618 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5619 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5620 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5621 nzi += nlnk; 5622 prow = lnk[prow]; 5623 nzbd++; 5624 } 5625 bdiag[i] = nzbd; 5626 bi[i+1] = bi[i] + nzi; 5627 5628 /* if free space is not available, make more free space */ 5629 if (current_space->local_remaining<nzi) { 5630 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5631 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5632 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5633 reallocs++; 5634 } 5635 5636 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5637 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5638 bj_ptr[i] = current_space->array; 5639 bjlvl_ptr[i] = current_space_lvl->array; 5640 5641 /* make sure the active row i has diagonal entry */ 5642 if (*(bj_ptr[i]+bdiag[i]) != i) { 5643 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5644 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5645 } 5646 5647 current_space->array += nzi; 5648 current_space->local_used += nzi; 5649 current_space->local_remaining -= nzi; 5650 current_space_lvl->array += nzi; 5651 current_space_lvl->local_used += nzi; 5652 current_space_lvl->local_remaining -= nzi; 5653 } 5654 5655 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5656 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5657 5658 /* destroy list of free space and other temporary arrays */ 5659 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5660 5661 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5662 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5663 5664 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5665 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5666 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5667 5668 #if defined(PETSC_USE_INFO) 5669 { 5670 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5671 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5672 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5673 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5674 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5675 if (diagonal_fill) { 5676 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5677 } 5678 } 5679 #endif 5680 5681 /* put together the new matrix */ 5682 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5683 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5684 b = (Mat_SeqBAIJ*)(fact)->data; 5685 b->free_a = PETSC_TRUE; 5686 b->free_ij = PETSC_TRUE; 5687 b->singlemalloc = PETSC_FALSE; 5688 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5689 b->j = bj; 5690 b->i = bi; 5691 b->diag = bdiag; 5692 b->free_diag = PETSC_TRUE; 5693 b->ilen = 0; 5694 b->imax = 0; 5695 b->row = isrow; 5696 b->col = iscol; 5697 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5698 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5699 b->icol = isicol; 5700 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5701 /* In b structure: Free imax, ilen, old a, old j. 5702 Allocate bdiag, solve_work, new a, new j */ 5703 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5704 b->maxnz = b->nz = bdiag[0]+1; 5705 fact->info.factor_mallocs = reallocs; 5706 fact->info.fill_ratio_given = f; 5707 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5708 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5709 PetscFunctionReturn(0); 5710 } 5711 5712 5713 /* 5714 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5715 except that the data structure of Mat_SeqAIJ is slightly different. 5716 Not a good example of code reuse. 5717 */ 5718 #undef __FUNCT__ 5719 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 5720 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5721 { 5722 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5723 IS isicol; 5724 PetscErrorCode ierr; 5725 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5726 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5727 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5728 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5729 PetscTruth col_identity,row_identity,both_identity,flg; 5730 PetscReal f; 5731 5732 PetscFunctionBegin; 5733 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5734 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5735 5736 f = info->fill; 5737 levels = (PetscInt)info->levels; 5738 diagonal_fill = (PetscInt)info->diagonal_fill; 5739 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5740 5741 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5742 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5743 both_identity = (PetscTruth) (row_identity && col_identity); 5744 5745 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5746 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5747 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 5748 5749 fact->factor = MAT_FACTOR_ILU; 5750 b = (Mat_SeqBAIJ*)fact->data; 5751 b->row = isrow; 5752 b->col = iscol; 5753 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5754 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5755 b->icol = isicol; 5756 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5757 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5758 PetscFunctionReturn(0); 5759 } 5760 5761 /* general case perform the symbolic factorization */ 5762 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5763 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5764 5765 /* get new row pointers */ 5766 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5767 ainew[0] = 0; 5768 /* don't know how many column pointers are needed so estimate */ 5769 jmax = (PetscInt)(f*ai[n] + 1); 5770 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5771 /* ajfill is level of fill for each fill entry */ 5772 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5773 /* fill is a linked list of nonzeros in active row */ 5774 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5775 /* im is level for each filled value */ 5776 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5777 /* dloc is location of diagonal in factor */ 5778 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5779 dloc[0] = 0; 5780 for (prow=0; prow<n; prow++) { 5781 5782 /* copy prow into linked list */ 5783 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5784 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5785 xi = aj + ai[r[prow]]; 5786 fill[n] = n; 5787 fill[prow] = -1; /* marker for diagonal entry */ 5788 while (nz--) { 5789 fm = n; 5790 idx = ic[*xi++]; 5791 do { 5792 m = fm; 5793 fm = fill[m]; 5794 } while (fm < idx); 5795 fill[m] = idx; 5796 fill[idx] = fm; 5797 im[idx] = 0; 5798 } 5799 5800 /* make sure diagonal entry is included */ 5801 if (diagonal_fill && fill[prow] == -1) { 5802 fm = n; 5803 while (fill[fm] < prow) fm = fill[fm]; 5804 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5805 fill[fm] = prow; 5806 im[prow] = 0; 5807 nzf++; 5808 dcount++; 5809 } 5810 5811 nzi = 0; 5812 row = fill[n]; 5813 while (row < prow) { 5814 incrlev = im[row] + 1; 5815 nz = dloc[row]; 5816 xi = ajnew + ainew[row] + nz + 1; 5817 flev = ajfill + ainew[row] + nz + 1; 5818 nnz = ainew[row+1] - ainew[row] - nz - 1; 5819 fm = row; 5820 while (nnz-- > 0) { 5821 idx = *xi++; 5822 if (*flev + incrlev > levels) { 5823 flev++; 5824 continue; 5825 } 5826 do { 5827 m = fm; 5828 fm = fill[m]; 5829 } while (fm < idx); 5830 if (fm != idx) { 5831 im[idx] = *flev + incrlev; 5832 fill[m] = idx; 5833 fill[idx] = fm; 5834 fm = idx; 5835 nzf++; 5836 } else { 5837 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5838 } 5839 flev++; 5840 } 5841 row = fill[row]; 5842 nzi++; 5843 } 5844 /* copy new filled row into permanent storage */ 5845 ainew[prow+1] = ainew[prow] + nzf; 5846 if (ainew[prow+1] > jmax) { 5847 5848 /* estimate how much additional space we will need */ 5849 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5850 /* just double the memory each time */ 5851 PetscInt maxadd = jmax; 5852 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5853 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5854 jmax += maxadd; 5855 5856 /* allocate a longer ajnew and ajfill */ 5857 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5858 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5859 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5860 ajnew = xitmp; 5861 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5862 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5863 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5864 ajfill = xitmp; 5865 reallocate++; /* count how many reallocations are needed */ 5866 } 5867 xitmp = ajnew + ainew[prow]; 5868 flev = ajfill + ainew[prow]; 5869 dloc[prow] = nzi; 5870 fm = fill[n]; 5871 while (nzf--) { 5872 *xitmp++ = fm; 5873 *flev++ = im[fm]; 5874 fm = fill[fm]; 5875 } 5876 /* make sure row has diagonal entry */ 5877 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5878 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5879 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5880 } 5881 } 5882 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5883 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5884 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5885 ierr = PetscFree(fill);CHKERRQ(ierr); 5886 ierr = PetscFree(im);CHKERRQ(ierr); 5887 5888 #if defined(PETSC_USE_INFO) 5889 { 5890 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5891 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5892 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5893 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5894 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5895 if (diagonal_fill) { 5896 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5897 } 5898 } 5899 #endif 5900 5901 /* put together the new matrix */ 5902 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5903 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5904 b = (Mat_SeqBAIJ*)fact->data; 5905 b->free_a = PETSC_TRUE; 5906 b->free_ij = PETSC_TRUE; 5907 b->singlemalloc = PETSC_FALSE; 5908 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5909 b->j = ajnew; 5910 b->i = ainew; 5911 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5912 b->diag = dloc; 5913 b->free_diag = PETSC_TRUE; 5914 b->ilen = 0; 5915 b->imax = 0; 5916 b->row = isrow; 5917 b->col = iscol; 5918 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5919 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5920 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5921 b->icol = isicol; 5922 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5923 /* In b structure: Free imax, ilen, old a, old j. 5924 Allocate dloc, solve_work, new a, new j */ 5925 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5926 b->maxnz = b->nz = ainew[n]; 5927 5928 fact->info.factor_mallocs = reallocate; 5929 fact->info.fill_ratio_given = f; 5930 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5931 5932 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 5933 PetscFunctionReturn(0); 5934 } 5935 5936 #undef __FUNCT__ 5937 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5938 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5939 { 5940 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5941 /* int i,*AJ=a->j,nz=a->nz; */ 5942 PetscFunctionBegin; 5943 /* Undo Column scaling */ 5944 /* while (nz--) { */ 5945 /* AJ[i] = AJ[i]/4; */ 5946 /* } */ 5947 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5948 A->ops->setunfactored = PETSC_NULL; 5949 PetscFunctionReturn(0); 5950 } 5951 5952 #undef __FUNCT__ 5953 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5954 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5955 { 5956 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5957 PetscInt *AJ=a->j,nz=a->nz; 5958 unsigned short *aj=(unsigned short *)AJ; 5959 PetscFunctionBegin; 5960 /* Is this really necessary? */ 5961 while (nz--) { 5962 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5963 } 5964 A->ops->setunfactored = PETSC_NULL; 5965 PetscFunctionReturn(0); 5966 } 5967 5968 5969