1 #define PETSCMAT_DLL 2 3 /* 4 Factorization code for BAIJ format. 5 */ 6 7 #include "../src/mat/impls/baij/seq/baij.h" 8 #include "../src/mat/blockinvert.h" 9 #include "petscbt.h" 10 #include "../src/mat/utils/freespace.h" 11 12 #undef __FUNCT__ 13 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace" 14 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 15 { 16 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17 PetscErrorCode ierr; 18 PetscInt i,nz; 19 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 20 const MatScalar *aa=a->a,*v; 21 PetscScalar s1,*x; 22 const PetscScalar *b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,nz,idx,idt,oidx; 65 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 66 const MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2,*x; 68 const PetscScalar *b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124 PetscInt nz,idx,idt,j,i,oidx; 125 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 126 const MatScalar *aa=a->a,*v; 127 PetscScalar s1,s2,x1,x2,*x; 128 const PetscScalar *b; 129 130 PetscFunctionBegin; 131 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134 135 /* forward solve the U^T */ 136 idx = 0; 137 for (i=0; i<n; i++) { 138 v = aa + bs2*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; 141 s1 = v[0]*x1 + v[1]*x2; 142 s2 = v[2]*x1 + v[3]*x2; 143 v -= bs2; 144 145 vi = aj + diag[i] - 1; 146 nz = diag[i] - diag[i+1] - 1; 147 for(j=0;j>-nz;j--){ 148 oidx = bs*vi[j]; 149 x[oidx] -= v[0]*s1 + v[1]*s2; 150 x[oidx+1] -= v[2]*s1 + v[3]*s2; 151 v -= bs2; 152 } 153 x[idx] = s1;x[1+idx] = s2; 154 idx += bs; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + bs2*ai[i]; 159 vi = aj + ai[i]; 160 nz = ai[i+1] - ai[i]; 161 idt = bs*i; 162 s1 = x[idt]; s2 = x[1+idt]; 163 for(j=0;j<nz;j++){ 164 idx = bs*vi[j]; 165 x[idx] -= v[0]*s1 + v[1]*s2; 166 x[idx+1] -= v[2]*s1 + v[3]*s2; 167 v += bs2; 168 } 169 } 170 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 183 PetscInt i,nz,idx,idt,oidx; 184 const MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,x1,x2,x3,*x; 186 const PetscScalar *b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 9*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203 v += 9; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 3*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212 v += 9; 213 } 214 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215 idx += 3; 216 } 217 /* backward solve the L^T */ 218 for (i=n-1; i>=0; i--){ 219 v = aa + 9*diag[i] - 9; 220 vi = aj + diag[i] - 1; 221 nz = diag[i] - ai[i]; 222 idt = 3*i; 223 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224 while (nz--) { 225 idx = 3*(*vi--); 226 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229 v -= 9; 230 } 231 } 232 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 PetscErrorCode ierr; 244 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 245 PetscInt nz,idx,idt,j,i,oidx; 246 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 247 const MatScalar *aa=a->a,*v; 248 PetscScalar s1,s2,s3,x1,x2,x3,*x; 249 const PetscScalar *b; 250 251 PetscFunctionBegin; 252 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255 256 /* forward solve the U^T */ 257 idx = 0; 258 for (i=0; i<n; i++) { 259 v = aa + bs2*diag[i]; 260 /* multiply by the inverse of the block diagonal */ 261 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 262 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 263 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 264 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 265 v -= bs2; 266 267 vi = aj + diag[i] - 1; 268 nz = diag[i] - diag[i+1] - 1; 269 for(j=0;j>-nz;j--){ 270 oidx = bs*vi[j]; 271 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 272 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 273 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 274 v -= bs2; 275 } 276 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 277 idx += bs; 278 } 279 /* backward solve the L^T */ 280 for (i=n-1; i>=0; i--){ 281 v = aa + bs2*ai[i]; 282 vi = aj + ai[i]; 283 nz = ai[i+1] - ai[i]; 284 idt = bs*i; 285 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 286 for(j=0;j<nz;j++){ 287 idx = bs*vi[j]; 288 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 289 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 290 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 291 v += bs2; 292 } 293 } 294 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace" 302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 PetscErrorCode ierr; 306 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 307 PetscInt i,nz,idx,idt,oidx; 308 const MatScalar *aa=a->a,*v; 309 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 310 const PetscScalar *b; 311 312 PetscFunctionBegin; 313 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316 317 /* forward solve the U^T */ 318 idx = 0; 319 for (i=0; i<n; i++) { 320 321 v = aa + 16*diag[i]; 322 /* multiply by the inverse of the block diagonal */ 323 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328 v += 16; 329 330 vi = aj + diag[i] + 1; 331 nz = ai[i+1] - diag[i] - 1; 332 while (nz--) { 333 oidx = 4*(*vi++); 334 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338 v += 16; 339 } 340 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341 idx += 4; 342 } 343 /* backward solve the L^T */ 344 for (i=n-1; i>=0; i--){ 345 v = aa + 16*diag[i] - 16; 346 vi = aj + diag[i] - 1; 347 nz = diag[i] - ai[i]; 348 idt = 4*i; 349 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350 while (nz--) { 351 idx = 4*(*vi--); 352 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356 v -= 16; 357 } 358 } 359 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 370 PetscErrorCode ierr; 371 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 372 PetscInt nz,idx,idt,j,i,oidx; 373 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 374 const MatScalar *aa=a->a,*v; 375 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x; 376 const PetscScalar *b; 377 378 PetscFunctionBegin; 379 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 383 /* forward solve the U^T */ 384 idx = 0; 385 for (i=0; i<n; i++) { 386 v = aa + bs2*diag[i]; 387 /* multiply by the inverse of the block diagonal */ 388 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 389 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393 v -= bs2; 394 395 vi = aj + diag[i] - 1; 396 nz = diag[i] - diag[i+1] - 1; 397 for(j=0;j>-nz;j--){ 398 oidx = bs*vi[j]; 399 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 400 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 401 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 402 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 403 v -= bs2; 404 } 405 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 406 idx += bs; 407 } 408 /* backward solve the L^T */ 409 for (i=n-1; i>=0; i--){ 410 v = aa + bs2*ai[i]; 411 vi = aj + ai[i]; 412 nz = ai[i+1] - ai[i]; 413 idt = bs*i; 414 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 415 for(j=0;j<nz;j++){ 416 idx = bs*vi[j]; 417 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 418 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 419 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 420 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 421 v += bs2; 422 } 423 } 424 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 425 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 426 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 #undef __FUNCT__ 431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace" 432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 433 { 434 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435 PetscErrorCode ierr; 436 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 437 PetscInt i,nz,idx,idt,oidx; 438 const MatScalar *aa=a->a,*v; 439 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 440 const PetscScalar *b; 441 442 PetscFunctionBegin; 443 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 444 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446 447 /* forward solve the U^T */ 448 idx = 0; 449 for (i=0; i<n; i++) { 450 451 v = aa + 25*diag[i]; 452 /* multiply by the inverse of the block diagonal */ 453 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459 v += 25; 460 461 vi = aj + diag[i] + 1; 462 nz = ai[i+1] - diag[i] - 1; 463 while (nz--) { 464 oidx = 5*(*vi++); 465 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470 v += 25; 471 } 472 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473 idx += 5; 474 } 475 /* backward solve the L^T */ 476 for (i=n-1; i>=0; i--){ 477 v = aa + 25*diag[i] - 25; 478 vi = aj + diag[i] - 1; 479 nz = diag[i] - ai[i]; 480 idt = 5*i; 481 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482 while (nz--) { 483 idx = 5*(*vi--); 484 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489 v -= 25; 490 } 491 } 492 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 493 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #undef __FUNCT__ 499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 501 { 502 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 503 PetscErrorCode ierr; 504 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 505 PetscInt nz,idx,idt,j,i,oidx; 506 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 507 const MatScalar *aa=a->a,*v; 508 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x; 509 const PetscScalar *b; 510 511 PetscFunctionBegin; 512 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 514 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515 516 /* forward solve the U^T */ 517 idx = 0; 518 for (i=0; i<n; i++) { 519 v = aa + bs2*diag[i]; 520 /* multiply by the inverse of the block diagonal */ 521 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 522 x5 = x[4+idx]; 523 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 524 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 525 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 526 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 527 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 528 v -= bs2; 529 530 vi = aj + diag[i] - 1; 531 nz = diag[i] - diag[i+1] - 1; 532 for(j=0;j>-nz;j--){ 533 oidx = bs*vi[j]; 534 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 535 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 536 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 537 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 538 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 539 v -= bs2; 540 } 541 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 542 idx += bs; 543 } 544 /* backward solve the L^T */ 545 for (i=n-1; i>=0; i--){ 546 v = aa + bs2*ai[i]; 547 vi = aj + ai[i]; 548 nz = ai[i+1] - ai[i]; 549 idt = bs*i; 550 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 551 for(j=0;j<nz;j++){ 552 idx = bs*vi[j]; 553 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 554 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 555 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 556 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 557 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 558 v += bs2; 559 } 560 } 561 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 562 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 563 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 564 PetscFunctionReturn(0); 565 } 566 567 #undef __FUNCT__ 568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace" 569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 570 { 571 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572 PetscErrorCode ierr; 573 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 574 PetscInt i,nz,idx,idt,oidx; 575 const MatScalar *aa=a->a,*v; 576 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 577 const PetscScalar *b; 578 579 PetscFunctionBegin; 580 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 581 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583 584 /* forward solve the U^T */ 585 idx = 0; 586 for (i=0; i<n; i++) { 587 588 v = aa + 36*diag[i]; 589 /* multiply by the inverse of the block diagonal */ 590 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591 x6 = x[5+idx]; 592 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598 v += 36; 599 600 vi = aj + diag[i] + 1; 601 nz = ai[i+1] - diag[i] - 1; 602 while (nz--) { 603 oidx = 6*(*vi++); 604 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610 v += 36; 611 } 612 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613 x[5+idx] = s6; 614 idx += 6; 615 } 616 /* backward solve the L^T */ 617 for (i=n-1; i>=0; i--){ 618 v = aa + 36*diag[i] - 36; 619 vi = aj + diag[i] - 1; 620 nz = diag[i] - ai[i]; 621 idt = 6*i; 622 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623 s6 = x[5+idt]; 624 while (nz--) { 625 idx = 6*(*vi--); 626 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v -= 36; 633 } 634 } 635 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 636 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 #undef __FUNCT__ 642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 644 { 645 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 646 PetscErrorCode ierr; 647 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 648 PetscInt nz,idx,idt,j,i,oidx; 649 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 650 const MatScalar *aa=a->a,*v; 651 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x; 652 const PetscScalar *b; 653 654 PetscFunctionBegin; 655 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 657 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658 659 /* forward solve the U^T */ 660 idx = 0; 661 for (i=0; i<n; i++) { 662 v = aa + bs2*diag[i]; 663 /* multiply by the inverse of the block diagonal */ 664 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 665 x5 = x[4+idx]; x6 = x[5+idx]; 666 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 667 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 668 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 669 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 670 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 671 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 672 v -= bs2; 673 674 vi = aj + diag[i] - 1; 675 nz = diag[i] - diag[i+1] - 1; 676 for(j=0;j>-nz;j--){ 677 oidx = bs*vi[j]; 678 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684 v -= bs2; 685 } 686 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 687 x[5+idx] = s6; 688 idx += bs; 689 } 690 /* backward solve the L^T */ 691 for (i=n-1; i>=0; i--){ 692 v = aa + bs2*ai[i]; 693 vi = aj + ai[i]; 694 nz = ai[i+1] - ai[i]; 695 idt = bs*i; 696 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 697 s6 = x[5+idt]; 698 for(j=0;j<nz;j++){ 699 idx = bs*vi[j]; 700 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 701 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 702 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 703 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 704 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 705 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 706 v += bs2; 707 } 708 } 709 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 711 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace" 717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 718 { 719 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720 PetscErrorCode ierr; 721 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 722 PetscInt i,nz,idx,idt,oidx; 723 const MatScalar *aa=a->a,*v; 724 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 725 const PetscScalar *b; 726 727 PetscFunctionBegin; 728 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 729 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731 732 /* forward solve the U^T */ 733 idx = 0; 734 for (i=0; i<n; i++) { 735 736 v = aa + 49*diag[i]; 737 /* multiply by the inverse of the block diagonal */ 738 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739 x6 = x[5+idx]; x7 = x[6+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747 v += 49; 748 749 vi = aj + diag[i] + 1; 750 nz = ai[i+1] - diag[i] - 1; 751 while (nz--) { 752 oidx = 7*(*vi++); 753 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760 v += 49; 761 } 762 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763 x[5+idx] = s6;x[6+idx] = s7; 764 idx += 7; 765 } 766 /* backward solve the L^T */ 767 for (i=n-1; i>=0; i--){ 768 v = aa + 49*diag[i] - 49; 769 vi = aj + diag[i] - 1; 770 nz = diag[i] - ai[i]; 771 idt = 7*i; 772 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773 s6 = x[5+idt];s7 = x[6+idt]; 774 while (nz--) { 775 idx = 7*(*vi--); 776 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783 v -= 49; 784 } 785 } 786 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 787 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 PetscErrorCode ierr; 797 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 798 PetscInt nz,idx,idt,j,i,oidx; 799 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 800 const MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x; 802 const PetscScalar *b; 803 804 PetscFunctionBegin; 805 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 807 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 808 809 /* forward solve the U^T */ 810 idx = 0; 811 for (i=0; i<n; i++) { 812 v = aa + bs2*diag[i]; 813 /* multiply by the inverse of the block diagonal */ 814 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 815 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 816 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 817 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 818 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 819 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 820 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 821 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 822 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 823 v -= bs2; 824 vi = aj + diag[i] - 1; 825 nz = diag[i] - diag[i+1] - 1; 826 for(j=0;j>-nz;j--){ 827 oidx = bs*vi[j]; 828 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835 v -= bs2; 836 } 837 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 838 x[5+idx] = s6; x[6+idx] = s7; 839 idx += bs; 840 } 841 /* backward solve the L^T */ 842 for (i=n-1; i>=0; i--){ 843 v = aa + bs2*ai[i]; 844 vi = aj + ai[i]; 845 nz = ai[i+1] - ai[i]; 846 idt = bs*i; 847 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 848 s6 = x[5+idt]; s7 = x[6+idt]; 849 for(j=0;j<nz;j++){ 850 idx = bs*vi[j]; 851 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 852 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 853 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 854 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 855 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 856 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 857 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 858 v += bs2; 859 } 860 } 861 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 862 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 863 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 867 /*---------------------------------------------------------------------------------------------*/ 868 #undef __FUNCT__ 869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace" 870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 871 { 872 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873 IS iscol=a->col,isrow=a->row; 874 PetscErrorCode ierr; 875 const PetscInt *r,*c,*rout,*cout; 876 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 877 PetscInt i,nz; 878 const MatScalar *aa=a->a,*v; 879 PetscScalar s1,*x,*t; 880 const PetscScalar *b; 881 882 PetscFunctionBegin; 883 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 884 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 885 t = a->solve_work; 886 887 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 888 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 889 890 /* copy the b into temp work space according to permutation */ 891 for (i=0; i<n; i++) { 892 t[i] = b[c[i]]; 893 } 894 895 /* forward solve the U^T */ 896 for (i=0; i<n; i++) { 897 898 v = aa + diag[i]; 899 /* multiply by the inverse of the block diagonal */ 900 s1 = (*v++)*t[i]; 901 vi = aj + diag[i] + 1; 902 nz = ai[i+1] - diag[i] - 1; 903 while (nz--) { 904 t[*vi++] -= (*v++)*s1; 905 } 906 t[i] = s1; 907 } 908 /* backward solve the L^T */ 909 for (i=n-1; i>=0; i--){ 910 v = aa + diag[i] - 1; 911 vi = aj + diag[i] - 1; 912 nz = diag[i] - ai[i]; 913 s1 = t[i]; 914 while (nz--) { 915 t[*vi--] -= (*v--)*s1; 916 } 917 } 918 919 /* copy t into x according to permutation */ 920 for (i=0; i<n; i++) { 921 x[r[i]] = t[i]; 922 } 923 924 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 925 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 926 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 927 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 928 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 929 PetscFunctionReturn(0); 930 } 931 932 #undef __FUNCT__ 933 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace" 934 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 935 { 936 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 937 IS iscol=a->col,isrow=a->row; 938 PetscErrorCode ierr; 939 const PetscInt *r,*c,*rout,*cout; 940 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 941 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 942 const MatScalar *aa=a->a,*v; 943 PetscScalar s1,s2,x1,x2,*x,*t; 944 const PetscScalar *b; 945 946 PetscFunctionBegin; 947 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 948 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 949 t = a->solve_work; 950 951 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 952 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 953 954 /* copy the b into temp work space according to permutation */ 955 ii = 0; 956 for (i=0; i<n; i++) { 957 ic = 2*c[i]; 958 t[ii] = b[ic]; 959 t[ii+1] = b[ic+1]; 960 ii += 2; 961 } 962 963 /* forward solve the U^T */ 964 idx = 0; 965 for (i=0; i<n; i++) { 966 967 v = aa + 4*diag[i]; 968 /* multiply by the inverse of the block diagonal */ 969 x1 = t[idx]; x2 = t[1+idx]; 970 s1 = v[0]*x1 + v[1]*x2; 971 s2 = v[2]*x1 + v[3]*x2; 972 v += 4; 973 974 vi = aj + diag[i] + 1; 975 nz = ai[i+1] - diag[i] - 1; 976 while (nz--) { 977 oidx = 2*(*vi++); 978 t[oidx] -= v[0]*s1 + v[1]*s2; 979 t[oidx+1] -= v[2]*s1 + v[3]*s2; 980 v += 4; 981 } 982 t[idx] = s1;t[1+idx] = s2; 983 idx += 2; 984 } 985 /* backward solve the L^T */ 986 for (i=n-1; i>=0; i--){ 987 v = aa + 4*diag[i] - 4; 988 vi = aj + diag[i] - 1; 989 nz = diag[i] - ai[i]; 990 idt = 2*i; 991 s1 = t[idt]; s2 = t[1+idt]; 992 while (nz--) { 993 idx = 2*(*vi--); 994 t[idx] -= v[0]*s1 + v[1]*s2; 995 t[idx+1] -= v[2]*s1 + v[3]*s2; 996 v -= 4; 997 } 998 } 999 1000 /* copy t into x according to permutation */ 1001 ii = 0; 1002 for (i=0; i<n; i++) { 1003 ir = 2*r[i]; 1004 x[ir] = t[ii]; 1005 x[ir+1] = t[ii+1]; 1006 ii += 2; 1007 } 1008 1009 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1010 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1011 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1012 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1013 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1014 PetscFunctionReturn(0); 1015 } 1016 1017 #undef __FUNCT__ 1018 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 1019 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 1020 { 1021 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1022 PetscErrorCode ierr; 1023 IS iscol=a->col,isrow=a->row; 1024 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1025 const PetscInt *r,*c,*rout,*cout; 1026 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1027 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1028 const MatScalar *aa=a->a,*v; 1029 PetscScalar s1,s2,x1,x2,*x,*t; 1030 const PetscScalar *b; 1031 1032 PetscFunctionBegin; 1033 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1034 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1035 t = a->solve_work; 1036 1037 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1038 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1039 1040 /* copy b into temp work space according to permutation */ 1041 for(i=0;i<n;i++){ 1042 ii = bs*i; ic = bs*c[i]; 1043 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1044 } 1045 1046 /* forward solve the U^T */ 1047 idx = 0; 1048 for (i=0; i<n; i++) { 1049 v = aa + bs2*diag[i]; 1050 /* multiply by the inverse of the block diagonal */ 1051 x1 = t[idx]; x2 = t[1+idx]; 1052 s1 = v[0]*x1 + v[1]*x2; 1053 s2 = v[2]*x1 + v[3]*x2; 1054 v -= bs2; 1055 1056 vi = aj + diag[i] - 1; 1057 nz = diag[i] - diag[i+1] - 1; 1058 for(j=0;j>-nz;j--){ 1059 oidx = bs*vi[j]; 1060 t[oidx] -= v[0]*s1 + v[1]*s2; 1061 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1062 v -= bs2; 1063 } 1064 t[idx] = s1;t[1+idx] = s2; 1065 idx += bs; 1066 } 1067 /* backward solve the L^T */ 1068 for (i=n-1; i>=0; i--){ 1069 v = aa + bs2*ai[i]; 1070 vi = aj + ai[i]; 1071 nz = ai[i+1] - ai[i]; 1072 idt = bs*i; 1073 s1 = t[idt]; s2 = t[1+idt]; 1074 for(j=0;j<nz;j++){ 1075 idx = bs*vi[j]; 1076 t[idx] -= v[0]*s1 + v[1]*s2; 1077 t[idx+1] -= v[2]*s1 + v[3]*s2; 1078 v += bs2; 1079 } 1080 } 1081 1082 /* copy t into x according to permutation */ 1083 for(i=0;i<n;i++){ 1084 ii = bs*i; ir = bs*r[i]; 1085 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1086 } 1087 1088 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1089 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1092 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1093 PetscFunctionReturn(0); 1094 } 1095 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace" 1098 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 1099 { 1100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101 IS iscol=a->col,isrow=a->row; 1102 PetscErrorCode ierr; 1103 const PetscInt *r,*c,*rout,*cout; 1104 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1105 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1106 const MatScalar *aa=a->a,*v; 1107 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1108 const PetscScalar *b; 1109 1110 PetscFunctionBegin; 1111 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1113 t = a->solve_work; 1114 1115 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1116 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1117 1118 /* copy the b into temp work space according to permutation */ 1119 ii = 0; 1120 for (i=0; i<n; i++) { 1121 ic = 3*c[i]; 1122 t[ii] = b[ic]; 1123 t[ii+1] = b[ic+1]; 1124 t[ii+2] = b[ic+2]; 1125 ii += 3; 1126 } 1127 1128 /* forward solve the U^T */ 1129 idx = 0; 1130 for (i=0; i<n; i++) { 1131 1132 v = aa + 9*diag[i]; 1133 /* multiply by the inverse of the block diagonal */ 1134 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1135 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1136 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1137 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1138 v += 9; 1139 1140 vi = aj + diag[i] + 1; 1141 nz = ai[i+1] - diag[i] - 1; 1142 while (nz--) { 1143 oidx = 3*(*vi++); 1144 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1145 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1146 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1147 v += 9; 1148 } 1149 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1150 idx += 3; 1151 } 1152 /* backward solve the L^T */ 1153 for (i=n-1; i>=0; i--){ 1154 v = aa + 9*diag[i] - 9; 1155 vi = aj + diag[i] - 1; 1156 nz = diag[i] - ai[i]; 1157 idt = 3*i; 1158 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1159 while (nz--) { 1160 idx = 3*(*vi--); 1161 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1162 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1163 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1164 v -= 9; 1165 } 1166 } 1167 1168 /* copy t into x according to permutation */ 1169 ii = 0; 1170 for (i=0; i<n; i++) { 1171 ir = 3*r[i]; 1172 x[ir] = t[ii]; 1173 x[ir+1] = t[ii+1]; 1174 x[ir+2] = t[ii+2]; 1175 ii += 3; 1176 } 1177 1178 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1179 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1180 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1181 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1182 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1183 PetscFunctionReturn(0); 1184 } 1185 1186 #undef __FUNCT__ 1187 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1188 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1189 { 1190 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1191 PetscErrorCode ierr; 1192 IS iscol=a->col,isrow=a->row; 1193 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1194 const PetscInt *r,*c,*rout,*cout; 1195 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1196 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1197 const MatScalar *aa=a->a,*v; 1198 PetscScalar s1,s2,s3,x1,x2,x3,*x,*t; 1199 const PetscScalar *b; 1200 1201 PetscFunctionBegin; 1202 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1203 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1204 t = a->solve_work; 1205 1206 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1207 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1208 1209 /* copy b into temp work space according to permutation */ 1210 for(i=0;i<n;i++){ 1211 ii = bs*i; ic = bs*c[i]; 1212 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1213 } 1214 1215 /* forward solve the U^T */ 1216 idx = 0; 1217 for (i=0; i<n; i++) { 1218 v = aa + bs2*diag[i]; 1219 /* multiply by the inverse of the block diagonal */ 1220 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1221 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1222 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1223 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1224 v -= bs2; 1225 1226 vi = aj + diag[i] - 1; 1227 nz = diag[i] - diag[i+1] - 1; 1228 for(j=0;j>-nz;j--){ 1229 oidx = bs*vi[j]; 1230 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1231 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1232 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1233 v -= bs2; 1234 } 1235 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1236 idx += bs; 1237 } 1238 /* backward solve the L^T */ 1239 for (i=n-1; i>=0; i--){ 1240 v = aa + bs2*ai[i]; 1241 vi = aj + ai[i]; 1242 nz = ai[i+1] - ai[i]; 1243 idt = bs*i; 1244 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1245 for(j=0;j<nz;j++){ 1246 idx = bs*vi[j]; 1247 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1248 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1249 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1250 v += bs2; 1251 } 1252 } 1253 1254 /* copy t into x according to permutation */ 1255 for(i=0;i<n;i++){ 1256 ii = bs*i; ir = bs*r[i]; 1257 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1258 } 1259 1260 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1261 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1263 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1264 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1265 PetscFunctionReturn(0); 1266 } 1267 1268 #undef __FUNCT__ 1269 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace" 1270 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 1271 { 1272 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1273 IS iscol=a->col,isrow=a->row; 1274 PetscErrorCode ierr; 1275 const PetscInt *r,*c,*rout,*cout; 1276 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1277 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1278 const MatScalar *aa=a->a,*v; 1279 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1280 const PetscScalar *b; 1281 1282 PetscFunctionBegin; 1283 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1284 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1285 t = a->solve_work; 1286 1287 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1288 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1289 1290 /* copy the b into temp work space according to permutation */ 1291 ii = 0; 1292 for (i=0; i<n; i++) { 1293 ic = 4*c[i]; 1294 t[ii] = b[ic]; 1295 t[ii+1] = b[ic+1]; 1296 t[ii+2] = b[ic+2]; 1297 t[ii+3] = b[ic+3]; 1298 ii += 4; 1299 } 1300 1301 /* forward solve the U^T */ 1302 idx = 0; 1303 for (i=0; i<n; i++) { 1304 1305 v = aa + 16*diag[i]; 1306 /* multiply by the inverse of the block diagonal */ 1307 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1308 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1309 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1310 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1311 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1312 v += 16; 1313 1314 vi = aj + diag[i] + 1; 1315 nz = ai[i+1] - diag[i] - 1; 1316 while (nz--) { 1317 oidx = 4*(*vi++); 1318 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1319 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1320 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1321 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1322 v += 16; 1323 } 1324 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1325 idx += 4; 1326 } 1327 /* backward solve the L^T */ 1328 for (i=n-1; i>=0; i--){ 1329 v = aa + 16*diag[i] - 16; 1330 vi = aj + diag[i] - 1; 1331 nz = diag[i] - ai[i]; 1332 idt = 4*i; 1333 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1334 while (nz--) { 1335 idx = 4*(*vi--); 1336 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1337 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1338 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1339 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1340 v -= 16; 1341 } 1342 } 1343 1344 /* copy t into x according to permutation */ 1345 ii = 0; 1346 for (i=0; i<n; i++) { 1347 ir = 4*r[i]; 1348 x[ir] = t[ii]; 1349 x[ir+1] = t[ii+1]; 1350 x[ir+2] = t[ii+2]; 1351 x[ir+3] = t[ii+3]; 1352 ii += 4; 1353 } 1354 1355 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1356 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1357 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1358 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1359 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1360 PetscFunctionReturn(0); 1361 } 1362 1363 #undef __FUNCT__ 1364 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1365 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1366 { 1367 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1368 PetscErrorCode ierr; 1369 IS iscol=a->col,isrow=a->row; 1370 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1371 const PetscInt *r,*c,*rout,*cout; 1372 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1373 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1374 const MatScalar *aa=a->a,*v; 1375 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4,*x,*t; 1376 const PetscScalar *b; 1377 1378 PetscFunctionBegin; 1379 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1380 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1381 t = a->solve_work; 1382 1383 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1384 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1385 1386 /* copy b into temp work space according to permutation */ 1387 for(i=0;i<n;i++){ 1388 ii = bs*i; ic = bs*c[i]; 1389 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1390 } 1391 1392 /* forward solve the U^T */ 1393 idx = 0; 1394 for (i=0; i<n; i++) { 1395 v = aa + bs2*diag[i]; 1396 /* multiply by the inverse of the block diagonal */ 1397 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1398 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1399 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1400 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1401 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1402 v -= bs2; 1403 1404 vi = aj + diag[i] - 1; 1405 nz = diag[i] - diag[i+1] - 1; 1406 for(j=0;j>-nz;j--){ 1407 oidx = bs*vi[j]; 1408 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1409 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1410 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1411 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1412 v -= bs2; 1413 } 1414 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1415 idx += bs; 1416 } 1417 /* backward solve the L^T */ 1418 for (i=n-1; i>=0; i--){ 1419 v = aa + bs2*ai[i]; 1420 vi = aj + ai[i]; 1421 nz = ai[i+1] - ai[i]; 1422 idt = bs*i; 1423 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1424 for(j=0;j<nz;j++){ 1425 idx = bs*vi[j]; 1426 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1427 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1428 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1429 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1430 v += bs2; 1431 } 1432 } 1433 1434 /* copy t into x according to permutation */ 1435 for(i=0;i<n;i++){ 1436 ii = bs*i; ir = bs*r[i]; 1437 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1438 } 1439 1440 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1441 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1442 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1444 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1445 PetscFunctionReturn(0); 1446 } 1447 1448 #undef __FUNCT__ 1449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace" 1450 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 1451 { 1452 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1453 IS iscol=a->col,isrow=a->row; 1454 PetscErrorCode ierr; 1455 const PetscInt *r,*c,*rout,*cout; 1456 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1457 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1458 const MatScalar *aa=a->a,*v; 1459 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1460 const PetscScalar *b; 1461 1462 PetscFunctionBegin; 1463 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1464 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1465 t = a->solve_work; 1466 1467 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1468 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1469 1470 /* copy the b into temp work space according to permutation */ 1471 ii = 0; 1472 for (i=0; i<n; i++) { 1473 ic = 5*c[i]; 1474 t[ii] = b[ic]; 1475 t[ii+1] = b[ic+1]; 1476 t[ii+2] = b[ic+2]; 1477 t[ii+3] = b[ic+3]; 1478 t[ii+4] = b[ic+4]; 1479 ii += 5; 1480 } 1481 1482 /* forward solve the U^T */ 1483 idx = 0; 1484 for (i=0; i<n; i++) { 1485 1486 v = aa + 25*diag[i]; 1487 /* multiply by the inverse of the block diagonal */ 1488 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1489 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1490 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1491 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1492 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1493 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1494 v += 25; 1495 1496 vi = aj + diag[i] + 1; 1497 nz = ai[i+1] - diag[i] - 1; 1498 while (nz--) { 1499 oidx = 5*(*vi++); 1500 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1501 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1502 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1503 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1504 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1505 v += 25; 1506 } 1507 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1508 idx += 5; 1509 } 1510 /* backward solve the L^T */ 1511 for (i=n-1; i>=0; i--){ 1512 v = aa + 25*diag[i] - 25; 1513 vi = aj + diag[i] - 1; 1514 nz = diag[i] - ai[i]; 1515 idt = 5*i; 1516 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1517 while (nz--) { 1518 idx = 5*(*vi--); 1519 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1520 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1521 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1522 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1523 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1524 v -= 25; 1525 } 1526 } 1527 1528 /* copy t into x according to permutation */ 1529 ii = 0; 1530 for (i=0; i<n; i++) { 1531 ir = 5*r[i]; 1532 x[ir] = t[ii]; 1533 x[ir+1] = t[ii+1]; 1534 x[ir+2] = t[ii+2]; 1535 x[ir+3] = t[ii+3]; 1536 x[ir+4] = t[ii+4]; 1537 ii += 5; 1538 } 1539 1540 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1541 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1542 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1543 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1544 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1545 PetscFunctionReturn(0); 1546 } 1547 1548 #undef __FUNCT__ 1549 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1550 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1551 { 1552 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1553 PetscErrorCode ierr; 1554 IS iscol=a->col,isrow=a->row; 1555 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1556 const PetscInt *r,*c,*rout,*cout; 1557 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1558 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1559 const MatScalar *aa=a->a,*v; 1560 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t; 1561 const PetscScalar *b; 1562 1563 PetscFunctionBegin; 1564 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1565 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1566 t = a->solve_work; 1567 1568 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1569 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1570 1571 /* copy b into temp work space according to permutation */ 1572 for(i=0;i<n;i++){ 1573 ii = bs*i; ic = bs*c[i]; 1574 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1575 t[ii+4] = b[ic+4]; 1576 } 1577 1578 /* forward solve the U^T */ 1579 idx = 0; 1580 for (i=0; i<n; i++) { 1581 v = aa + bs2*diag[i]; 1582 /* multiply by the inverse of the block diagonal */ 1583 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1584 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1585 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1586 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1587 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1588 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1589 v -= bs2; 1590 1591 vi = aj + diag[i] - 1; 1592 nz = diag[i] - diag[i+1] - 1; 1593 for(j=0;j>-nz;j--){ 1594 oidx = bs*vi[j]; 1595 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1596 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1597 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1598 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1599 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1600 v -= bs2; 1601 } 1602 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1603 idx += bs; 1604 } 1605 /* backward solve the L^T */ 1606 for (i=n-1; i>=0; i--){ 1607 v = aa + bs2*ai[i]; 1608 vi = aj + ai[i]; 1609 nz = ai[i+1] - ai[i]; 1610 idt = bs*i; 1611 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1612 for(j=0;j<nz;j++){ 1613 idx = bs*vi[j]; 1614 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1615 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1616 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1617 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1618 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1619 v += bs2; 1620 } 1621 } 1622 1623 /* copy t into x according to permutation */ 1624 for(i=0;i<n;i++){ 1625 ii = bs*i; ir = bs*r[i]; 1626 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1627 x[ir+4] = t[ii+4]; 1628 } 1629 1630 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1631 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1633 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1634 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1635 PetscFunctionReturn(0); 1636 } 1637 1638 #undef __FUNCT__ 1639 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace" 1640 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 1641 { 1642 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1643 IS iscol=a->col,isrow=a->row; 1644 PetscErrorCode ierr; 1645 const PetscInt *r,*c,*rout,*cout; 1646 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1647 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1648 const MatScalar *aa=a->a,*v; 1649 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1650 const PetscScalar *b; 1651 1652 PetscFunctionBegin; 1653 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1654 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1655 t = a->solve_work; 1656 1657 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1658 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1659 1660 /* copy the b into temp work space according to permutation */ 1661 ii = 0; 1662 for (i=0; i<n; i++) { 1663 ic = 6*c[i]; 1664 t[ii] = b[ic]; 1665 t[ii+1] = b[ic+1]; 1666 t[ii+2] = b[ic+2]; 1667 t[ii+3] = b[ic+3]; 1668 t[ii+4] = b[ic+4]; 1669 t[ii+5] = b[ic+5]; 1670 ii += 6; 1671 } 1672 1673 /* forward solve the U^T */ 1674 idx = 0; 1675 for (i=0; i<n; i++) { 1676 1677 v = aa + 36*diag[i]; 1678 /* multiply by the inverse of the block diagonal */ 1679 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1680 x6 = t[5+idx]; 1681 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1682 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1683 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1684 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1685 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1686 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1687 v += 36; 1688 1689 vi = aj + diag[i] + 1; 1690 nz = ai[i+1] - diag[i] - 1; 1691 while (nz--) { 1692 oidx = 6*(*vi++); 1693 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1694 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1695 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1696 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1697 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1698 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1699 v += 36; 1700 } 1701 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1702 t[5+idx] = s6; 1703 idx += 6; 1704 } 1705 /* backward solve the L^T */ 1706 for (i=n-1; i>=0; i--){ 1707 v = aa + 36*diag[i] - 36; 1708 vi = aj + diag[i] - 1; 1709 nz = diag[i] - ai[i]; 1710 idt = 6*i; 1711 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1712 s6 = t[5+idt]; 1713 while (nz--) { 1714 idx = 6*(*vi--); 1715 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1716 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1717 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1718 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1719 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1720 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1721 v -= 36; 1722 } 1723 } 1724 1725 /* copy t into x according to permutation */ 1726 ii = 0; 1727 for (i=0; i<n; i++) { 1728 ir = 6*r[i]; 1729 x[ir] = t[ii]; 1730 x[ir+1] = t[ii+1]; 1731 x[ir+2] = t[ii+2]; 1732 x[ir+3] = t[ii+3]; 1733 x[ir+4] = t[ii+4]; 1734 x[ir+5] = t[ii+5]; 1735 ii += 6; 1736 } 1737 1738 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1739 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1742 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 #undef __FUNCT__ 1747 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1748 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1749 { 1750 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1751 PetscErrorCode ierr; 1752 IS iscol=a->col,isrow=a->row; 1753 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1754 const PetscInt *r,*c,*rout,*cout; 1755 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1756 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1757 const MatScalar *aa=a->a,*v; 1758 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t; 1759 const PetscScalar *b; 1760 1761 PetscFunctionBegin; 1762 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1763 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1764 t = a->solve_work; 1765 1766 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1767 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1768 1769 /* copy b into temp work space according to permutation */ 1770 for(i=0;i<n;i++){ 1771 ii = bs*i; ic = bs*c[i]; 1772 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1773 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1774 } 1775 1776 /* forward solve the U^T */ 1777 idx = 0; 1778 for (i=0; i<n; i++) { 1779 v = aa + bs2*diag[i]; 1780 /* multiply by the inverse of the block diagonal */ 1781 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1782 x6 = t[5+idx]; 1783 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1784 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1785 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1786 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1787 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1788 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1789 v -= bs2; 1790 1791 vi = aj + diag[i] - 1; 1792 nz = diag[i] - diag[i+1] - 1; 1793 for(j=0;j>-nz;j--){ 1794 oidx = bs*vi[j]; 1795 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1796 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1797 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1798 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1799 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1800 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1801 v -= bs2; 1802 } 1803 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1804 t[5+idx] = s6; 1805 idx += bs; 1806 } 1807 /* backward solve the L^T */ 1808 for (i=n-1; i>=0; i--){ 1809 v = aa + bs2*ai[i]; 1810 vi = aj + ai[i]; 1811 nz = ai[i+1] - ai[i]; 1812 idt = bs*i; 1813 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1814 s6 = t[5+idt]; 1815 for(j=0;j<nz;j++){ 1816 idx = bs*vi[j]; 1817 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1818 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1819 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1820 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1821 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1822 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1823 v += bs2; 1824 } 1825 } 1826 1827 /* copy t into x according to permutation */ 1828 for(i=0;i<n;i++){ 1829 ii = bs*i; ir = bs*r[i]; 1830 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1831 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1832 } 1833 1834 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1835 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1836 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1837 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1838 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1839 PetscFunctionReturn(0); 1840 } 1841 1842 #undef __FUNCT__ 1843 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace" 1844 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 1845 { 1846 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1847 IS iscol=a->col,isrow=a->row; 1848 PetscErrorCode ierr; 1849 const PetscInt *r,*c,*rout,*cout; 1850 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1851 PetscInt i,nz,idx,idt,ii,ic,ir,oidx; 1852 const MatScalar *aa=a->a,*v; 1853 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1854 const PetscScalar *b; 1855 1856 PetscFunctionBegin; 1857 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1858 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1859 t = a->solve_work; 1860 1861 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1862 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1863 1864 /* copy the b into temp work space according to permutation */ 1865 ii = 0; 1866 for (i=0; i<n; i++) { 1867 ic = 7*c[i]; 1868 t[ii] = b[ic]; 1869 t[ii+1] = b[ic+1]; 1870 t[ii+2] = b[ic+2]; 1871 t[ii+3] = b[ic+3]; 1872 t[ii+4] = b[ic+4]; 1873 t[ii+5] = b[ic+5]; 1874 t[ii+6] = b[ic+6]; 1875 ii += 7; 1876 } 1877 1878 /* forward solve the U^T */ 1879 idx = 0; 1880 for (i=0; i<n; i++) { 1881 1882 v = aa + 49*diag[i]; 1883 /* multiply by the inverse of the block diagonal */ 1884 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1885 x6 = t[5+idx]; x7 = t[6+idx]; 1886 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1887 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1888 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1889 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1890 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1891 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1892 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1893 v += 49; 1894 1895 vi = aj + diag[i] + 1; 1896 nz = ai[i+1] - diag[i] - 1; 1897 while (nz--) { 1898 oidx = 7*(*vi++); 1899 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1900 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1901 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1902 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1903 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1904 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1905 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1906 v += 49; 1907 } 1908 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1909 t[5+idx] = s6;t[6+idx] = s7; 1910 idx += 7; 1911 } 1912 /* backward solve the L^T */ 1913 for (i=n-1; i>=0; i--){ 1914 v = aa + 49*diag[i] - 49; 1915 vi = aj + diag[i] - 1; 1916 nz = diag[i] - ai[i]; 1917 idt = 7*i; 1918 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1919 s6 = t[5+idt];s7 = t[6+idt]; 1920 while (nz--) { 1921 idx = 7*(*vi--); 1922 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1923 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1924 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1925 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1926 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1927 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1928 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1929 v -= 49; 1930 } 1931 } 1932 1933 /* copy t into x according to permutation */ 1934 ii = 0; 1935 for (i=0; i<n; i++) { 1936 ir = 7*r[i]; 1937 x[ir] = t[ii]; 1938 x[ir+1] = t[ii+1]; 1939 x[ir+2] = t[ii+2]; 1940 x[ir+3] = t[ii+3]; 1941 x[ir+4] = t[ii+4]; 1942 x[ir+5] = t[ii+5]; 1943 x[ir+6] = t[ii+6]; 1944 ii += 7; 1945 } 1946 1947 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1948 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1949 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1950 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1951 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1952 PetscFunctionReturn(0); 1953 } 1954 #undef __FUNCT__ 1955 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1956 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1957 { 1958 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1959 PetscErrorCode ierr; 1960 IS iscol=a->col,isrow=a->row; 1961 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1962 const PetscInt *r,*c,*rout,*cout; 1963 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1964 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 1965 const MatScalar *aa=a->a,*v; 1966 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 1967 const PetscScalar *b; 1968 1969 PetscFunctionBegin; 1970 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1971 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1972 t = a->solve_work; 1973 1974 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1975 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1976 1977 /* copy b into temp work space according to permutation */ 1978 for(i=0;i<n;i++){ 1979 ii = bs*i; ic = bs*c[i]; 1980 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1981 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 1982 } 1983 1984 /* forward solve the U^T */ 1985 idx = 0; 1986 for (i=0; i<n; i++) { 1987 v = aa + bs2*diag[i]; 1988 /* multiply by the inverse of the block diagonal */ 1989 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1990 x6 = t[5+idx]; x7 = t[6+idx]; 1991 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1992 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1993 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1994 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1995 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1996 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1997 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1998 v -= bs2; 1999 2000 vi = aj + diag[i] - 1; 2001 nz = diag[i] - diag[i+1] - 1; 2002 for(j=0;j>-nz;j--){ 2003 oidx = bs*vi[j]; 2004 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2005 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2006 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2007 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2008 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2009 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2010 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2011 v -= bs2; 2012 } 2013 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2014 t[5+idx] = s6; t[6+idx] = s7; 2015 idx += bs; 2016 } 2017 /* backward solve the L^T */ 2018 for (i=n-1; i>=0; i--){ 2019 v = aa + bs2*ai[i]; 2020 vi = aj + ai[i]; 2021 nz = ai[i+1] - ai[i]; 2022 idt = bs*i; 2023 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2024 s6 = t[5+idt]; s7 = t[6+idt]; 2025 for(j=0;j<nz;j++){ 2026 idx = bs*vi[j]; 2027 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2028 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2029 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2030 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2031 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2032 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2033 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2034 v += bs2; 2035 } 2036 } 2037 2038 /* copy t into x according to permutation */ 2039 for(i=0;i<n;i++){ 2040 ii = bs*i; ir = bs*r[i]; 2041 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2042 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2043 } 2044 2045 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2046 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2047 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2048 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2049 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2050 PetscFunctionReturn(0); 2051 } 2052 2053 /* ----------------------------------------------------------- */ 2054 #undef __FUNCT__ 2055 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace" 2056 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2057 { 2058 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2059 IS iscol=a->col,isrow=a->row; 2060 PetscErrorCode ierr; 2061 const PetscInt *r,*c,*rout,*cout; 2062 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi; 2063 PetscInt i,nz; 2064 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2065 const MatScalar *aa=a->a,*v; 2066 PetscScalar *x,*s,*t,*ls; 2067 const PetscScalar *b; 2068 2069 PetscFunctionBegin; 2070 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2071 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2072 t = a->solve_work; 2073 2074 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2075 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2076 2077 /* forward solve the lower triangular */ 2078 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2079 for (i=1; i<n; i++) { 2080 v = aa + bs2*ai[i]; 2081 vi = aj + ai[i]; 2082 nz = a->diag[i] - ai[i]; 2083 s = t + bs*i; 2084 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2085 while (nz--) { 2086 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2087 v += bs2; 2088 } 2089 } 2090 /* backward solve the upper triangular */ 2091 ls = a->solve_work + A->cmap->n; 2092 for (i=n-1; i>=0; i--){ 2093 v = aa + bs2*(a->diag[i] + 1); 2094 vi = aj + a->diag[i] + 1; 2095 nz = ai[i+1] - a->diag[i] - 1; 2096 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2097 while (nz--) { 2098 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2099 v += bs2; 2100 } 2101 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2102 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2103 } 2104 2105 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2106 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2107 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2109 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2110 PetscFunctionReturn(0); 2111 } 2112 2113 /* ----------------------------------------------------------- */ 2114 #undef __FUNCT__ 2115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace" 2116 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx) 2117 { 2118 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2119 IS iscol=a->col,isrow=a->row; 2120 PetscErrorCode ierr; 2121 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2122 PetscInt i,nz,j; 2123 const PetscInt n=a->mbs,bs=A->rmap->bs,bs2=a->bs2; 2124 const MatScalar *aa=a->a,*v; 2125 PetscScalar *x,*t,*ls; 2126 const PetscScalar *b; 2127 PetscFunctionBegin; 2128 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2130 t = a->solve_work; 2131 2132 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2133 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2134 2135 /* copy the b into temp work space according to permutation */ 2136 for (i=0; i<n; i++) { 2137 for (j=0; j<bs; j++) { 2138 t[i*bs+j] = b[c[i]*bs+j]; 2139 } 2140 } 2141 2142 2143 /* forward solve the upper triangular transpose */ 2144 ls = a->solve_work + A->cmap->n; 2145 for (i=0; i<n; i++){ 2146 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2147 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2148 v = aa + bs2*(a->diag[i] + 1); 2149 vi = aj + a->diag[i] + 1; 2150 nz = ai[i+1] - a->diag[i] - 1; 2151 while (nz--) { 2152 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2153 v += bs2; 2154 } 2155 } 2156 2157 /* backward solve the lower triangular transpose */ 2158 for (i=n-1; i>=0; i--) { 2159 v = aa + bs2*ai[i]; 2160 vi = aj + ai[i]; 2161 nz = a->diag[i] - ai[i]; 2162 while (nz--) { 2163 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2164 v += bs2; 2165 } 2166 } 2167 2168 /* copy t into x according to permutation */ 2169 for (i=0; i<n; i++) { 2170 for (j=0; j<bs; j++) { 2171 x[bs*r[i]+j] = t[bs*i+j]; 2172 } 2173 } 2174 2175 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2176 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2177 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2178 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2179 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2180 PetscFunctionReturn(0); 2181 } 2182 2183 #undef __FUNCT__ 2184 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2185 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2186 { 2187 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2188 IS iscol=a->col,isrow=a->row; 2189 PetscErrorCode ierr; 2190 const PetscInt *r,*c,*rout,*cout; 2191 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2192 PetscInt i,j,nz; 2193 const PetscInt bs=A->rmap->bs,bs2=a->bs2; 2194 const MatScalar *aa=a->a,*v; 2195 PetscScalar *x,*t,*ls; 2196 const PetscScalar *b; 2197 2198 PetscFunctionBegin; 2199 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2200 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2201 t = a->solve_work; 2202 2203 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2204 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2205 2206 /* copy the b into temp work space according to permutation */ 2207 for (i=0; i<n; i++) { 2208 for (j=0; j<bs; j++) { 2209 t[i*bs+j] = b[c[i]*bs+j]; 2210 } 2211 } 2212 2213 2214 /* forward solve the upper triangular transpose */ 2215 ls = a->solve_work + A->cmap->n; 2216 for (i=0; i<n; i++){ 2217 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2218 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2219 v = aa + bs2*(diag[i] - 1); 2220 vi = aj + diag[i] - 1; 2221 nz = diag[i] - diag[i+1] - 1; 2222 for(j=0;j>-nz;j--){ 2223 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2224 v -= bs2; 2225 } 2226 } 2227 2228 /* backward solve the lower triangular transpose */ 2229 for (i=n-1; i>=0; i--) { 2230 v = aa + bs2*ai[i]; 2231 vi = aj + ai[i]; 2232 nz = ai[i+1] - ai[i]; 2233 for(j=0;j<nz;j++){ 2234 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2235 v += bs2; 2236 } 2237 } 2238 2239 /* copy t into x according to permutation */ 2240 for (i=0; i<n; i++) { 2241 for (j=0; j<bs; j++) { 2242 x[bs*r[i]+j] = t[bs*i+j]; 2243 } 2244 } 2245 2246 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2247 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2248 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2249 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2250 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2251 PetscFunctionReturn(0); 2252 } 2253 2254 /* bs = 15 for PFLOTRAN */ 2255 2256 #undef __FUNCT__ 2257 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering" 2258 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering(Mat A,Vec bb,Vec xx) 2259 { 2260 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2261 PetscErrorCode ierr; 2262 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2263 PetscInt i,nz,idx,idt,idc,m; 2264 const MatScalar *aa=a->a,*v; 2265 PetscScalar s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15; 2266 PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15; 2267 PetscScalar *x,*t; 2268 const PetscScalar *b; 2269 2270 PetscFunctionBegin; 2271 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2272 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2273 t = a->solve_work; 2274 2275 /* forward solve the lower triangular */ 2276 idx = 0; 2277 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2278 t[5] = b[5+idx]; t[6] = b[6+idx]; t[7] = b[7+idx]; t[8] = b[8+idx]; t[9] = b[9+idx]; 2279 t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx]; 2280 2281 for (i=1; i<n; i++) { 2282 v = aa + bs2*ai[i]; 2283 vi = aj + ai[i]; 2284 nz = ai[i+1] - ai[i]; 2285 idx = bs*i; 2286 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; s4 = b[3+idx]; s5 = b[4+idx]; 2287 s6 = b[5+idx]; s7 = b[6+idx]; s8 = b[7+idx]; s9 = b[8+idx]; s10 = b[9+idx]; 2288 s11 = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx]; 2289 for(m=0;m<nz;m++){ 2290 idx = bs*vi[m]; 2291 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2292 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2293 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2294 2295 2296 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2297 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2298 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2299 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2300 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2301 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2302 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2303 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2304 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2305 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2306 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2307 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2308 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2309 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2310 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2311 2312 v += bs2; 2313 } 2314 idx = bs*i; 2315 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] = s5; 2316 t[5+idx] = s6; t[6+idx] = s7; t[7+idx] = s8; t[8+idx] = s9; t[9+idx] = s10; 2317 t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15; 2318 2319 } 2320 /* backward solve the upper triangular */ 2321 for (i=n-1; i>=0; i--){ 2322 v = aa + bs2*(adiag[i+1]+1); 2323 vi = aj + adiag[i+1]+1; 2324 nz = adiag[i] - adiag[i+1] - 1; 2325 idt = bs*i; 2326 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2327 s6 = t[5+idt]; s7 = t[6+idt]; s8 = t[7+idt]; s9 = t[8+idt]; s10 = t[9+idt]; 2328 s11 = t[10+idt]; s12 = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt]; 2329 2330 for(m=0;m<nz;m++){ 2331 idx = bs*vi[m]; 2332 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2333 x6 = t[5+idx]; x7 = t[6+idx]; x8 = t[7+idx]; x9 = t[8+idx]; x10 = t[9+idx]; 2334 x11 = t[10+idx]; x12 = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx]; 2335 2336 s1 -= v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15; 2337 s2 -= v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15; 2338 s3 -= v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15; 2339 s4 -= v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15; 2340 s5 -= v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15; 2341 s6 -= v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15; 2342 s7 -= v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15; 2343 s8 -= v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15; 2344 s9 -= v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15; 2345 s10 -= v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15; 2346 s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15; 2347 s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15; 2348 s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15; 2349 s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15; 2350 s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15; 2351 2352 v += bs2; 2353 } 2354 idc = bs*i; 2355 2356 x[idc] = t[idt] = v[0]*s1 + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7 + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15; 2357 x[1+idc] = t[1+idt] = v[1]*s1 + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7 + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15; 2358 x[2+idc] = t[2+idt] = v[2]*s1 + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7 + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15; 2359 x[3+idc] = t[3+idt] = v[3]*s1 + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7 + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15; 2360 x[4+idc] = t[4+idt] = v[4]*s1 + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7 + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15; 2361 x[5+idc] = t[5+idt] = v[5]*s1 + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7 + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15; 2362 x[6+idc] = t[6+idt] = v[6]*s1 + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7 + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15; 2363 x[7+idc] = t[7+idt] = v[7]*s1 + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7 + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15; 2364 x[8+idc] = t[8+idt] = v[8]*s1 + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7 + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15; 2365 x[9+idc] = t[9+idt] = v[9]*s1 + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7 + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15; 2366 x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15; 2367 x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15; 2368 x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15; 2369 x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15; 2370 x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15; 2371 2372 } 2373 2374 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2375 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2376 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2377 PetscFunctionReturn(0); 2378 } 2379 2380 #undef __FUNCT__ 2381 #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2" 2382 PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx) 2383 { 2384 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2385 PetscErrorCode ierr; 2386 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2; 2387 PetscInt i,k,nz,idx,idt,idc,m; 2388 const MatScalar *aa=a->a,*v; 2389 PetscScalar s[15]; 2390 PetscScalar *x,*t; 2391 const PetscScalar *b; 2392 2393 PetscFunctionBegin; 2394 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2395 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2396 t = a->solve_work; 2397 2398 /* forward solve the lower triangular */ 2399 idx = 0; 2400 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2401 t[5] = b[5+idx]; t[6] = b[6+idx]; t[7] = b[7+idx]; t[8] = b[8+idx]; t[9] = b[9+idx]; 2402 t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx]; 2403 2404 for (i=1; i<n; i++) { 2405 v = aa + bs2*ai[i]; 2406 vi = aj + ai[i]; 2407 nz = ai[i+1] - ai[i]; 2408 idx = bs*i; 2409 s[0] = b[idx]; s[1] = b[1+idx]; s[2] = b[2+idx]; s[3] = b[3+idx]; s[4] = b[4+idx]; 2410 s[5] = b[5+idx]; s[6] = b[6+idx]; s[7] = b[7+idx]; s[8] = b[8+idx]; s[9] = b[9+idx]; 2411 s[10] = b[10+idx]; s[11] = b[11+idx]; s[12] = b[12+idx]; s[13] = b[13+idx]; s[14] = b[14+idx]; 2412 for(m=0;m<nz;m++){ 2413 idx = bs*vi[m]; 2414 2415 for(k=0;k<15;k++){ 2416 s[0] -= v[0]*t[k+idx]; 2417 s[1] -= v[1]*t[k+idx]; 2418 s[2] -= v[2]*t[k+idx]; 2419 s[3] -= v[3]*t[k+idx]; 2420 s[4] -= v[4]*t[k+idx]; 2421 s[5] -= v[5]*t[k+idx]; 2422 s[6] -= v[6]*t[k+idx]; 2423 s[7] -= v[7]*t[k+idx]; 2424 s[8] -= v[8]*t[k+idx]; 2425 s[9] -= v[9]*t[k+idx]; 2426 s[10] -= v[10]*t[k+idx]; 2427 s[11] -= v[11]*t[k+idx]; 2428 s[12] -= v[12]*t[k+idx]; 2429 s[13] -= v[13]*t[k+idx]; 2430 s[14] -= v[14]*t[k+idx]; 2431 v += 15; 2432 } 2433 } 2434 idx = bs*i; 2435 t[idx] = s[0]; t[1+idx] = s[1]; t[2+idx] = s[2]; t[3+idx] = s[3]; t[4+idx] = s[4]; 2436 t[5+idx] = s[5]; t[6+idx] = s[6]; t[7+idx] = s[7]; t[8+idx] = s[8]; t[9+idx] = s[9]; 2437 t[10+idx] = s[10]; t[11+idx] = s[11]; t[12+idx] = s[12]; t[13+idx] = s[13]; t[14+idx] = s[14]; 2438 2439 } 2440 /* backward solve the upper triangular */ 2441 for (i=n-1; i>=0; i--){ 2442 v = aa + bs2*(adiag[i+1]+1); 2443 vi = aj + adiag[i+1]+1; 2444 nz = adiag[i] - adiag[i+1] - 1; 2445 idt = bs*i; 2446 s[0] = t[idt]; s[1] = t[1+idt]; s[2] = t[2+idt]; s[3] = t[3+idt]; s[4] = t[4+idt]; 2447 s[5] = t[5+idt]; s[6] = t[6+idt]; s[7] = t[7+idt]; s[8] = t[8+idt]; s[9] = t[9+idt]; 2448 s[10] = t[10+idt]; s[11] = t[11+idt]; s[12] = t[12+idt]; s[13] = t[13+idt]; s[14] = t[14+idt]; 2449 2450 for(m=0;m<nz;m++){ 2451 idx = bs*vi[m]; 2452 for(k=0;k<15;k++){ 2453 s[0] -= v[0]*t[k+idx]; 2454 s[1] -= v[1]*t[k+idx]; 2455 s[2] -= v[2]*t[k+idx]; 2456 s[3] -= v[3]*t[k+idx]; 2457 s[4] -= v[4]*t[k+idx]; 2458 s[5] -= v[5]*t[k+idx]; 2459 s[6] -= v[6]*t[k+idx]; 2460 s[7] -= v[7]*t[k+idx]; 2461 s[8] -= v[8]*t[k+idx]; 2462 s[9] -= v[9]*t[k+idx]; 2463 s[10] -= v[10]*t[k+idx]; 2464 s[11] -= v[11]*t[k+idx]; 2465 s[12] -= v[12]*t[k+idx]; 2466 s[13] -= v[13]*t[k+idx]; 2467 s[14] -= v[14]*t[k+idx]; 2468 v += 15; 2469 } 2470 } 2471 idc = bs*i; 2472 2473 for(k=0;k<15;k++){ 2474 t[idt] += v[0]*s[k]; 2475 t[1+idt] += v[1]*s[k]; 2476 t[2+idt] += v[2]*s[k]; 2477 t[3+idt] += v[3]*s[k]; 2478 t[4+idt] += v[4]*s[k]; 2479 t[5+idt] += v[5]*s[k]; 2480 t[6+idt] += v[6]*s[k]; 2481 t[7+idt] += v[7]*s[k]; 2482 t[8+idt] += v[8]*s[k]; 2483 t[9+idt] += v[9]*s[k]; 2484 t[10+idt] += v[10]*s[k]; 2485 t[11+idt] += v[11]*s[k]; 2486 t[12+idt] += v[12]*s[k]; 2487 t[13+idt] += v[13]*s[k]; 2488 t[14+idt] += v[14]*s[k]; 2489 v += 15; 2490 } 2491 x[idc] = t[idt]; x[1+idc] = t[1+idt]; x[2+idc] = t[2+idt]; x[3+idc] = t[3+idt]; x[4+idc] = t[4+idt]; 2492 x[5+idc] = t[5+idt]; x[6+idc] = t[6+idt]; x[7+idc] = t[7+idt]; x[8+idc] = t[8+idt]; x[9+idc] = t[9+idt]; 2493 x[10+idc] = t[10+idt]; x[11+idc] = t[11+idt]; x[12+idc] = t[12+idt]; x[13+idc] = t[13+idt]; x[14+idc] = t[14+idt]; 2494 } 2495 2496 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2497 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2498 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2499 PetscFunctionReturn(0); 2500 } 2501 2502 2503 #undef __FUNCT__ 2504 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace" 2505 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx) 2506 { 2507 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2508 IS iscol=a->col,isrow=a->row; 2509 PetscErrorCode ierr; 2510 const PetscInt *r,*c,*ai=a->i,*aj=a->j; 2511 const PetscInt *rout,*cout,*diag = a->diag,*vi,n=a->mbs; 2512 PetscInt i,nz,idx,idt,idc; 2513 const MatScalar *aa=a->a,*v; 2514 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2515 const PetscScalar *b; 2516 2517 PetscFunctionBegin; 2518 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2519 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2520 t = a->solve_work; 2521 2522 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2523 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2524 2525 /* forward solve the lower triangular */ 2526 idx = 7*(*r++); 2527 t[0] = b[idx]; t[1] = b[1+idx]; 2528 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2529 t[5] = b[5+idx]; t[6] = b[6+idx]; 2530 2531 for (i=1; i<n; i++) { 2532 v = aa + 49*ai[i]; 2533 vi = aj + ai[i]; 2534 nz = diag[i] - ai[i]; 2535 idx = 7*(*r++); 2536 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2537 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2538 while (nz--) { 2539 idx = 7*(*vi++); 2540 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2541 x4 = t[3+idx];x5 = t[4+idx]; 2542 x6 = t[5+idx];x7 = t[6+idx]; 2543 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2544 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2545 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2546 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2547 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2548 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2549 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2550 v += 49; 2551 } 2552 idx = 7*i; 2553 t[idx] = s1;t[1+idx] = s2; 2554 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2555 t[5+idx] = s6;t[6+idx] = s7; 2556 } 2557 /* backward solve the upper triangular */ 2558 for (i=n-1; i>=0; i--){ 2559 v = aa + 49*diag[i] + 49; 2560 vi = aj + diag[i] + 1; 2561 nz = ai[i+1] - diag[i] - 1; 2562 idt = 7*i; 2563 s1 = t[idt]; s2 = t[1+idt]; 2564 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2565 s6 = t[5+idt];s7 = t[6+idt]; 2566 while (nz--) { 2567 idx = 7*(*vi++); 2568 x1 = t[idx]; x2 = t[1+idx]; 2569 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2570 x6 = t[5+idx]; x7 = t[6+idx]; 2571 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2572 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2573 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2574 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2575 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2576 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2577 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2578 v += 49; 2579 } 2580 idc = 7*(*c--); 2581 v = aa + 49*diag[i]; 2582 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2583 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2584 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2585 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2586 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2587 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2588 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2589 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2590 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2591 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2592 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2593 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2594 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2595 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2596 } 2597 2598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2600 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2602 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2603 PetscFunctionReturn(0); 2604 } 2605 2606 #undef __FUNCT__ 2607 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2608 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2609 { 2610 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2611 IS iscol=a->col,isrow=a->row; 2612 PetscErrorCode ierr; 2613 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag; 2614 const PetscInt n=a->mbs,*rout,*cout,*vi; 2615 PetscInt i,nz,idx,idt,idc,m; 2616 const MatScalar *aa=a->a,*v; 2617 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t; 2618 const PetscScalar *b; 2619 2620 PetscFunctionBegin; 2621 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2622 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2623 t = a->solve_work; 2624 2625 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2626 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2627 2628 /* forward solve the lower triangular */ 2629 idx = 7*r[0]; 2630 t[0] = b[idx]; t[1] = b[1+idx]; 2631 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2632 t[5] = b[5+idx]; t[6] = b[6+idx]; 2633 2634 for (i=1; i<n; i++) { 2635 v = aa + 49*ai[i]; 2636 vi = aj + ai[i]; 2637 nz = ai[i+1] - ai[i]; 2638 idx = 7*r[i]; 2639 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2640 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2641 for(m=0;m<nz;m++){ 2642 idx = 7*vi[m]; 2643 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2644 x4 = t[3+idx];x5 = t[4+idx]; 2645 x6 = t[5+idx];x7 = t[6+idx]; 2646 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2647 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2648 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2649 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2650 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2651 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2652 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2653 v += 49; 2654 } 2655 idx = 7*i; 2656 t[idx] = s1;t[1+idx] = s2; 2657 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2658 t[5+idx] = s6;t[6+idx] = s7; 2659 } 2660 /* backward solve the upper triangular */ 2661 for (i=n-1; i>=0; i--){ 2662 v = aa + 49*(adiag[i+1]+1); 2663 vi = aj + adiag[i+1]+1; 2664 nz = adiag[i] - adiag[i+1] - 1; 2665 idt = 7*i; 2666 s1 = t[idt]; s2 = t[1+idt]; 2667 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2668 s6 = t[5+idt];s7 = t[6+idt]; 2669 for(m=0;m<nz;m++){ 2670 idx = 7*vi[m]; 2671 x1 = t[idx]; x2 = t[1+idx]; 2672 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2673 x6 = t[5+idx]; x7 = t[6+idx]; 2674 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2675 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2676 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2677 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2678 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2679 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2680 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2681 v += 49; 2682 } 2683 idc = 7*c[i]; 2684 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2685 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2686 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2687 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2688 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2689 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2690 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2691 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2692 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2693 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2694 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2695 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2696 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2697 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2698 } 2699 2700 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2701 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2702 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2704 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2705 PetscFunctionReturn(0); 2706 } 2707 2708 #undef __FUNCT__ 2709 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace" 2710 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 2711 { 2712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2713 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2714 PetscErrorCode ierr; 2715 PetscInt i,nz,idx,idt,jdx; 2716 const MatScalar *aa=a->a,*v; 2717 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2718 const PetscScalar *b; 2719 2720 PetscFunctionBegin; 2721 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2722 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2723 /* forward solve the lower triangular */ 2724 idx = 0; 2725 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2726 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2727 x[6] = b[6+idx]; 2728 for (i=1; i<n; i++) { 2729 v = aa + 49*ai[i]; 2730 vi = aj + ai[i]; 2731 nz = diag[i] - ai[i]; 2732 idx = 7*i; 2733 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2734 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2735 s7 = b[6+idx]; 2736 while (nz--) { 2737 jdx = 7*(*vi++); 2738 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2739 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2740 x7 = x[6+jdx]; 2741 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2742 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2743 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2744 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2745 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2746 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2747 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2748 v += 49; 2749 } 2750 x[idx] = s1; 2751 x[1+idx] = s2; 2752 x[2+idx] = s3; 2753 x[3+idx] = s4; 2754 x[4+idx] = s5; 2755 x[5+idx] = s6; 2756 x[6+idx] = s7; 2757 } 2758 /* backward solve the upper triangular */ 2759 for (i=n-1; i>=0; i--){ 2760 v = aa + 49*diag[i] + 49; 2761 vi = aj + diag[i] + 1; 2762 nz = ai[i+1] - diag[i] - 1; 2763 idt = 7*i; 2764 s1 = x[idt]; s2 = x[1+idt]; 2765 s3 = x[2+idt]; s4 = x[3+idt]; 2766 s5 = x[4+idt]; s6 = x[5+idt]; 2767 s7 = x[6+idt]; 2768 while (nz--) { 2769 idx = 7*(*vi++); 2770 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2771 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2772 x7 = x[6+idx]; 2773 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2774 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2775 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2776 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2777 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2778 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2779 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2780 v += 49; 2781 } 2782 v = aa + 49*diag[i]; 2783 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2784 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2785 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2786 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2787 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2788 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2789 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2790 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2791 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2792 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2793 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2794 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2795 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2796 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2797 } 2798 2799 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2800 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2801 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2802 PetscFunctionReturn(0); 2803 } 2804 2805 #undef __FUNCT__ 2806 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2807 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2808 { 2809 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2810 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2811 PetscErrorCode ierr; 2812 PetscInt i,k,nz,idx,jdx,idt; 2813 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2814 const MatScalar *aa=a->a,*v; 2815 PetscScalar *x; 2816 const PetscScalar *b; 2817 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2818 2819 PetscFunctionBegin; 2820 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2821 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2822 /* forward solve the lower triangular */ 2823 idx = 0; 2824 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2825 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2826 for (i=1; i<n; i++) { 2827 v = aa + bs2*ai[i]; 2828 vi = aj + ai[i]; 2829 nz = ai[i+1] - ai[i]; 2830 idx = bs*i; 2831 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2832 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2833 for(k=0;k<nz;k++) { 2834 jdx = bs*vi[k]; 2835 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2836 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2837 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2838 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2839 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2840 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2841 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2842 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2843 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2844 v += bs2; 2845 } 2846 2847 x[idx] = s1; 2848 x[1+idx] = s2; 2849 x[2+idx] = s3; 2850 x[3+idx] = s4; 2851 x[4+idx] = s5; 2852 x[5+idx] = s6; 2853 x[6+idx] = s7; 2854 } 2855 2856 /* backward solve the upper triangular */ 2857 for (i=n-1; i>=0; i--){ 2858 v = aa + bs2*(adiag[i+1]+1); 2859 vi = aj + adiag[i+1]+1; 2860 nz = adiag[i] - adiag[i+1]-1; 2861 idt = bs*i; 2862 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2863 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2864 for(k=0;k<nz;k++) { 2865 idx = bs*vi[k]; 2866 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2867 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2868 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2869 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2870 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2871 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2872 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2873 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2874 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2875 v += bs2; 2876 } 2877 /* x = inv_diagonal*x */ 2878 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2879 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2880 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2881 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2882 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2883 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2884 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2885 } 2886 2887 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2888 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2889 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2890 PetscFunctionReturn(0); 2891 } 2892 2893 #undef __FUNCT__ 2894 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace" 2895 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx) 2896 { 2897 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2898 IS iscol=a->col,isrow=a->row; 2899 PetscErrorCode ierr; 2900 const PetscInt *r,*c,*rout,*cout; 2901 const PetscInt *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 2902 PetscInt i,nz,idx,idt,idc; 2903 const MatScalar *aa=a->a,*v; 2904 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2905 const PetscScalar *b; 2906 2907 PetscFunctionBegin; 2908 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2909 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2910 t = a->solve_work; 2911 2912 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2913 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2914 2915 /* forward solve the lower triangular */ 2916 idx = 6*(*r++); 2917 t[0] = b[idx]; t[1] = b[1+idx]; 2918 t[2] = b[2+idx]; t[3] = b[3+idx]; 2919 t[4] = b[4+idx]; t[5] = b[5+idx]; 2920 for (i=1; i<n; i++) { 2921 v = aa + 36*ai[i]; 2922 vi = aj + ai[i]; 2923 nz = diag[i] - ai[i]; 2924 idx = 6*(*r++); 2925 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2926 s5 = b[4+idx]; s6 = b[5+idx]; 2927 while (nz--) { 2928 idx = 6*(*vi++); 2929 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2930 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2931 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2932 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2933 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2934 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2935 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2936 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2937 v += 36; 2938 } 2939 idx = 6*i; 2940 t[idx] = s1;t[1+idx] = s2; 2941 t[2+idx] = s3;t[3+idx] = s4; 2942 t[4+idx] = s5;t[5+idx] = s6; 2943 } 2944 /* backward solve the upper triangular */ 2945 for (i=n-1; i>=0; i--){ 2946 v = aa + 36*diag[i] + 36; 2947 vi = aj + diag[i] + 1; 2948 nz = ai[i+1] - diag[i] - 1; 2949 idt = 6*i; 2950 s1 = t[idt]; s2 = t[1+idt]; 2951 s3 = t[2+idt];s4 = t[3+idt]; 2952 s5 = t[4+idt];s6 = t[5+idt]; 2953 while (nz--) { 2954 idx = 6*(*vi++); 2955 x1 = t[idx]; x2 = t[1+idx]; 2956 x3 = t[2+idx]; x4 = t[3+idx]; 2957 x5 = t[4+idx]; x6 = t[5+idx]; 2958 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2959 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2960 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2961 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2962 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2963 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2964 v += 36; 2965 } 2966 idc = 6*(*c--); 2967 v = aa + 36*diag[i]; 2968 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2969 v[18]*s4+v[24]*s5+v[30]*s6; 2970 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2971 v[19]*s4+v[25]*s5+v[31]*s6; 2972 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2973 v[20]*s4+v[26]*s5+v[32]*s6; 2974 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2975 v[21]*s4+v[27]*s5+v[33]*s6; 2976 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2977 v[22]*s4+v[28]*s5+v[34]*s6; 2978 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2979 v[23]*s4+v[29]*s5+v[35]*s6; 2980 } 2981 2982 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2983 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2984 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2985 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2986 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2987 PetscFunctionReturn(0); 2988 } 2989 2990 #undef __FUNCT__ 2991 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2992 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 2993 { 2994 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2995 IS iscol=a->col,isrow=a->row; 2996 PetscErrorCode ierr; 2997 const PetscInt *r,*c,*rout,*cout; 2998 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 2999 PetscInt i,nz,idx,idt,idc,m; 3000 const MatScalar *aa=a->a,*v; 3001 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 3002 const PetscScalar *b; 3003 3004 PetscFunctionBegin; 3005 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3006 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3007 t = a->solve_work; 3008 3009 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3010 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3011 3012 /* forward solve the lower triangular */ 3013 idx = 6*r[0]; 3014 t[0] = b[idx]; t[1] = b[1+idx]; 3015 t[2] = b[2+idx]; t[3] = b[3+idx]; 3016 t[4] = b[4+idx]; t[5] = b[5+idx]; 3017 for (i=1; i<n; i++) { 3018 v = aa + 36*ai[i]; 3019 vi = aj + ai[i]; 3020 nz = ai[i+1] - ai[i]; 3021 idx = 6*r[i]; 3022 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3023 s5 = b[4+idx]; s6 = b[5+idx]; 3024 for(m=0;m<nz;m++){ 3025 idx = 6*vi[m]; 3026 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3027 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 3028 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3029 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3030 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3031 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3032 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3033 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3034 v += 36; 3035 } 3036 idx = 6*i; 3037 t[idx] = s1;t[1+idx] = s2; 3038 t[2+idx] = s3;t[3+idx] = s4; 3039 t[4+idx] = s5;t[5+idx] = s6; 3040 } 3041 /* backward solve the upper triangular */ 3042 for (i=n-1; i>=0; i--){ 3043 v = aa + 36*(adiag[i+1]+1); 3044 vi = aj + adiag[i+1]+1; 3045 nz = adiag[i] - adiag[i+1] - 1; 3046 idt = 6*i; 3047 s1 = t[idt]; s2 = t[1+idt]; 3048 s3 = t[2+idt];s4 = t[3+idt]; 3049 s5 = t[4+idt];s6 = t[5+idt]; 3050 for(m=0;m<nz;m++){ 3051 idx = 6*vi[m]; 3052 x1 = t[idx]; x2 = t[1+idx]; 3053 x3 = t[2+idx]; x4 = t[3+idx]; 3054 x5 = t[4+idx]; x6 = t[5+idx]; 3055 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3056 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3057 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3058 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3059 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3060 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3061 v += 36; 3062 } 3063 idc = 6*c[i]; 3064 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 3065 v[18]*s4+v[24]*s5+v[30]*s6; 3066 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 3067 v[19]*s4+v[25]*s5+v[31]*s6; 3068 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 3069 v[20]*s4+v[26]*s5+v[32]*s6; 3070 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 3071 v[21]*s4+v[27]*s5+v[33]*s6; 3072 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 3073 v[22]*s4+v[28]*s5+v[34]*s6; 3074 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 3075 v[23]*s4+v[29]*s5+v[35]*s6; 3076 } 3077 3078 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3079 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3080 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3081 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3082 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3083 PetscFunctionReturn(0); 3084 } 3085 3086 #undef __FUNCT__ 3087 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace" 3088 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3089 { 3090 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3091 PetscInt i,nz,idx,idt,jdx; 3092 PetscErrorCode ierr; 3093 const PetscInt *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j; 3094 const MatScalar *aa=a->a,*v; 3095 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3096 const PetscScalar *b; 3097 3098 PetscFunctionBegin; 3099 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3100 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3101 /* forward solve the lower triangular */ 3102 idx = 0; 3103 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 3104 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 3105 for (i=1; i<n; i++) { 3106 v = aa + 36*ai[i]; 3107 vi = aj + ai[i]; 3108 nz = diag[i] - ai[i]; 3109 idx = 6*i; 3110 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3111 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 3112 while (nz--) { 3113 jdx = 6*(*vi++); 3114 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 3115 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 3116 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3117 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3118 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3119 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3120 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3121 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3122 v += 36; 3123 } 3124 x[idx] = s1; 3125 x[1+idx] = s2; 3126 x[2+idx] = s3; 3127 x[3+idx] = s4; 3128 x[4+idx] = s5; 3129 x[5+idx] = s6; 3130 } 3131 /* backward solve the upper triangular */ 3132 for (i=n-1; i>=0; i--){ 3133 v = aa + 36*diag[i] + 36; 3134 vi = aj + diag[i] + 1; 3135 nz = ai[i+1] - diag[i] - 1; 3136 idt = 6*i; 3137 s1 = x[idt]; s2 = x[1+idt]; 3138 s3 = x[2+idt]; s4 = x[3+idt]; 3139 s5 = x[4+idt]; s6 = x[5+idt]; 3140 while (nz--) { 3141 idx = 6*(*vi++); 3142 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 3143 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 3144 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3145 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 3146 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3147 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3148 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3149 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3150 v += 36; 3151 } 3152 v = aa + 36*diag[i]; 3153 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3154 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3155 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3156 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3157 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3158 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3159 } 3160 3161 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3162 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3163 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 3164 PetscFunctionReturn(0); 3165 } 3166 3167 #undef __FUNCT__ 3168 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 3169 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 3170 { 3171 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3172 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3173 PetscErrorCode ierr; 3174 PetscInt i,k,nz,idx,jdx,idt; 3175 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3176 const MatScalar *aa=a->a,*v; 3177 PetscScalar *x; 3178 const PetscScalar *b; 3179 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 3180 3181 PetscFunctionBegin; 3182 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3183 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3184 /* forward solve the lower triangular */ 3185 idx = 0; 3186 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3187 x[4] = b[4+idx];x[5] = b[5+idx]; 3188 for (i=1; i<n; i++) { 3189 v = aa + bs2*ai[i]; 3190 vi = aj + ai[i]; 3191 nz = ai[i+1] - ai[i]; 3192 idx = bs*i; 3193 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3194 s5 = b[4+idx];s6 = b[5+idx]; 3195 for(k=0;k<nz;k++){ 3196 jdx = bs*vi[k]; 3197 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3198 x5 = x[4+jdx]; x6 = x[5+jdx]; 3199 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3200 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3201 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3202 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3203 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3204 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3205 v += bs2; 3206 } 3207 3208 x[idx] = s1; 3209 x[1+idx] = s2; 3210 x[2+idx] = s3; 3211 x[3+idx] = s4; 3212 x[4+idx] = s5; 3213 x[5+idx] = s6; 3214 } 3215 3216 /* backward solve the upper triangular */ 3217 for (i=n-1; i>=0; i--){ 3218 v = aa + bs2*(adiag[i+1]+1); 3219 vi = aj + adiag[i+1]+1; 3220 nz = adiag[i] - adiag[i+1]-1; 3221 idt = bs*i; 3222 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3223 s5 = x[4+idt];s6 = x[5+idt]; 3224 for(k=0;k<nz;k++){ 3225 idx = bs*vi[k]; 3226 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3227 x5 = x[4+idx];x6 = x[5+idx]; 3228 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 3229 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 3230 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 3231 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 3232 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 3233 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 3234 v += bs2; 3235 } 3236 /* x = inv_diagonal*x */ 3237 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 3238 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 3239 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 3240 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 3241 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 3242 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 3243 } 3244 3245 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3247 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3248 PetscFunctionReturn(0); 3249 } 3250 3251 #undef __FUNCT__ 3252 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace" 3253 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx) 3254 { 3255 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3256 IS iscol=a->col,isrow=a->row; 3257 PetscErrorCode ierr; 3258 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 3259 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3260 PetscInt i,nz,idx,idt,idc; 3261 const MatScalar *aa=a->a,*v; 3262 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3263 const PetscScalar *b; 3264 3265 PetscFunctionBegin; 3266 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3267 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3268 t = a->solve_work; 3269 3270 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3271 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3272 3273 /* forward solve the lower triangular */ 3274 idx = 5*(*r++); 3275 t[0] = b[idx]; t[1] = b[1+idx]; 3276 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3277 for (i=1; i<n; i++) { 3278 v = aa + 25*ai[i]; 3279 vi = aj + ai[i]; 3280 nz = diag[i] - ai[i]; 3281 idx = 5*(*r++); 3282 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3283 s5 = b[4+idx]; 3284 while (nz--) { 3285 idx = 5*(*vi++); 3286 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3287 x4 = t[3+idx];x5 = t[4+idx]; 3288 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3289 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3290 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3291 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3292 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3293 v += 25; 3294 } 3295 idx = 5*i; 3296 t[idx] = s1;t[1+idx] = s2; 3297 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3298 } 3299 /* backward solve the upper triangular */ 3300 for (i=n-1; i>=0; i--){ 3301 v = aa + 25*diag[i] + 25; 3302 vi = aj + diag[i] + 1; 3303 nz = ai[i+1] - diag[i] - 1; 3304 idt = 5*i; 3305 s1 = t[idt]; s2 = t[1+idt]; 3306 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3307 while (nz--) { 3308 idx = 5*(*vi++); 3309 x1 = t[idx]; x2 = t[1+idx]; 3310 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3311 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3312 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3313 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3314 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3315 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3316 v += 25; 3317 } 3318 idc = 5*(*c--); 3319 v = aa + 25*diag[i]; 3320 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3321 v[15]*s4+v[20]*s5; 3322 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3323 v[16]*s4+v[21]*s5; 3324 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3325 v[17]*s4+v[22]*s5; 3326 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3327 v[18]*s4+v[23]*s5; 3328 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3329 v[19]*s4+v[24]*s5; 3330 } 3331 3332 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3333 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3334 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3336 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3337 PetscFunctionReturn(0); 3338 } 3339 3340 #undef __FUNCT__ 3341 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 3342 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 3343 { 3344 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3345 IS iscol=a->col,isrow=a->row; 3346 PetscErrorCode ierr; 3347 const PetscInt *r,*c,*rout,*cout; 3348 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3349 PetscInt i,nz,idx,idt,idc,m; 3350 const MatScalar *aa=a->a,*v; 3351 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3352 const PetscScalar *b; 3353 3354 PetscFunctionBegin; 3355 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3356 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3357 t = a->solve_work; 3358 3359 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3360 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3361 3362 /* forward solve the lower triangular */ 3363 idx = 5*r[0]; 3364 t[0] = b[idx]; t[1] = b[1+idx]; 3365 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3366 for (i=1; i<n; i++) { 3367 v = aa + 25*ai[i]; 3368 vi = aj + ai[i]; 3369 nz = ai[i+1] - ai[i]; 3370 idx = 5*r[i]; 3371 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3372 s5 = b[4+idx]; 3373 for(m=0;m<nz;m++){ 3374 idx = 5*vi[m]; 3375 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3376 x4 = t[3+idx];x5 = t[4+idx]; 3377 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3378 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3379 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3380 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3381 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3382 v += 25; 3383 } 3384 idx = 5*i; 3385 t[idx] = s1;t[1+idx] = s2; 3386 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3387 } 3388 /* backward solve the upper triangular */ 3389 for (i=n-1; i>=0; i--){ 3390 v = aa + 25*(adiag[i+1]+1); 3391 vi = aj + adiag[i+1]+1; 3392 nz = adiag[i] - adiag[i+1] - 1; 3393 idt = 5*i; 3394 s1 = t[idt]; s2 = t[1+idt]; 3395 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3396 for(m=0;m<nz;m++){ 3397 idx = 5*vi[m]; 3398 x1 = t[idx]; x2 = t[1+idx]; 3399 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3400 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3401 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3402 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3403 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3404 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3405 v += 25; 3406 } 3407 idc = 5*c[i]; 3408 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3409 v[15]*s4+v[20]*s5; 3410 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3411 v[16]*s4+v[21]*s5; 3412 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3413 v[17]*s4+v[22]*s5; 3414 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3415 v[18]*s4+v[23]*s5; 3416 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3417 v[19]*s4+v[24]*s5; 3418 } 3419 3420 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3421 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3422 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3423 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3424 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3425 PetscFunctionReturn(0); 3426 } 3427 3428 #undef __FUNCT__ 3429 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace" 3430 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 3431 { 3432 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3433 const PetscInt *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3434 PetscInt i,nz,idx,idt,jdx; 3435 PetscErrorCode ierr; 3436 const MatScalar *aa=a->a,*v; 3437 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3438 const PetscScalar *b; 3439 3440 PetscFunctionBegin; 3441 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3442 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3443 /* forward solve the lower triangular */ 3444 idx = 0; 3445 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3446 for (i=1; i<n; i++) { 3447 v = aa + 25*ai[i]; 3448 vi = aj + ai[i]; 3449 nz = diag[i] - ai[i]; 3450 idx = 5*i; 3451 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3452 while (nz--) { 3453 jdx = 5*(*vi++); 3454 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3455 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3456 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3457 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3458 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3459 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3460 v += 25; 3461 } 3462 x[idx] = s1; 3463 x[1+idx] = s2; 3464 x[2+idx] = s3; 3465 x[3+idx] = s4; 3466 x[4+idx] = s5; 3467 } 3468 /* backward solve the upper triangular */ 3469 for (i=n-1; i>=0; i--){ 3470 v = aa + 25*diag[i] + 25; 3471 vi = aj + diag[i] + 1; 3472 nz = ai[i+1] - diag[i] - 1; 3473 idt = 5*i; 3474 s1 = x[idt]; s2 = x[1+idt]; 3475 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3476 while (nz--) { 3477 idx = 5*(*vi++); 3478 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3479 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3480 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3481 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3482 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3483 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3484 v += 25; 3485 } 3486 v = aa + 25*diag[i]; 3487 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3488 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3489 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3490 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3491 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3492 } 3493 3494 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3495 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3496 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3497 PetscFunctionReturn(0); 3498 } 3499 3500 #undef __FUNCT__ 3501 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3502 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3503 { 3504 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3505 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3506 PetscInt i,k,nz,idx,idt,jdx; 3507 PetscErrorCode ierr; 3508 const MatScalar *aa=a->a,*v; 3509 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3510 const PetscScalar *b; 3511 3512 PetscFunctionBegin; 3513 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3514 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3515 /* forward solve the lower triangular */ 3516 idx = 0; 3517 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3518 for (i=1; i<n; i++) { 3519 v = aa + 25*ai[i]; 3520 vi = aj + ai[i]; 3521 nz = ai[i+1] - ai[i]; 3522 idx = 5*i; 3523 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3524 for(k=0;k<nz;k++) { 3525 jdx = 5*vi[k]; 3526 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3527 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3528 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3529 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3530 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3531 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3532 v += 25; 3533 } 3534 x[idx] = s1; 3535 x[1+idx] = s2; 3536 x[2+idx] = s3; 3537 x[3+idx] = s4; 3538 x[4+idx] = s5; 3539 } 3540 3541 /* backward solve the upper triangular */ 3542 for (i=n-1; i>=0; i--){ 3543 v = aa + 25*(adiag[i+1]+1); 3544 vi = aj + adiag[i+1]+1; 3545 nz = adiag[i] - adiag[i+1]-1; 3546 idt = 5*i; 3547 s1 = x[idt]; s2 = x[1+idt]; 3548 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3549 for(k=0;k<nz;k++){ 3550 idx = 5*vi[k]; 3551 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3552 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3553 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3554 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3555 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3556 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3557 v += 25; 3558 } 3559 /* x = inv_diagonal*x */ 3560 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3561 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3562 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3563 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3564 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3565 } 3566 3567 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3568 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3569 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3570 PetscFunctionReturn(0); 3571 } 3572 3573 #undef __FUNCT__ 3574 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace" 3575 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx) 3576 { 3577 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3578 IS iscol=a->col,isrow=a->row; 3579 PetscErrorCode ierr; 3580 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3581 PetscInt i,nz,idx,idt,idc; 3582 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3583 const MatScalar *aa=a->a,*v; 3584 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3585 const PetscScalar *b; 3586 3587 PetscFunctionBegin; 3588 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3589 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3590 t = a->solve_work; 3591 3592 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3593 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3594 3595 /* forward solve the lower triangular */ 3596 idx = 4*(*r++); 3597 t[0] = b[idx]; t[1] = b[1+idx]; 3598 t[2] = b[2+idx]; t[3] = b[3+idx]; 3599 for (i=1; i<n; i++) { 3600 v = aa + 16*ai[i]; 3601 vi = aj + ai[i]; 3602 nz = diag[i] - ai[i]; 3603 idx = 4*(*r++); 3604 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3605 while (nz--) { 3606 idx = 4*(*vi++); 3607 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3608 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3609 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3610 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3611 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3612 v += 16; 3613 } 3614 idx = 4*i; 3615 t[idx] = s1;t[1+idx] = s2; 3616 t[2+idx] = s3;t[3+idx] = s4; 3617 } 3618 /* backward solve the upper triangular */ 3619 for (i=n-1; i>=0; i--){ 3620 v = aa + 16*diag[i] + 16; 3621 vi = aj + diag[i] + 1; 3622 nz = ai[i+1] - diag[i] - 1; 3623 idt = 4*i; 3624 s1 = t[idt]; s2 = t[1+idt]; 3625 s3 = t[2+idt];s4 = t[3+idt]; 3626 while (nz--) { 3627 idx = 4*(*vi++); 3628 x1 = t[idx]; x2 = t[1+idx]; 3629 x3 = t[2+idx]; x4 = t[3+idx]; 3630 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3631 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3632 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3633 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3634 v += 16; 3635 } 3636 idc = 4*(*c--); 3637 v = aa + 16*diag[i]; 3638 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3639 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3640 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3641 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3642 } 3643 3644 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3645 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3646 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3647 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3648 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3649 PetscFunctionReturn(0); 3650 } 3651 3652 #undef __FUNCT__ 3653 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3654 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3655 { 3656 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3657 IS iscol=a->col,isrow=a->row; 3658 PetscErrorCode ierr; 3659 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 3660 PetscInt i,nz,idx,idt,idc,m; 3661 const PetscInt *r,*c,*rout,*cout; 3662 const MatScalar *aa=a->a,*v; 3663 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3664 const PetscScalar *b; 3665 3666 PetscFunctionBegin; 3667 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3668 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3669 t = a->solve_work; 3670 3671 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3672 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3673 3674 /* forward solve the lower triangular */ 3675 idx = 4*r[0]; 3676 t[0] = b[idx]; t[1] = b[1+idx]; 3677 t[2] = b[2+idx]; t[3] = b[3+idx]; 3678 for (i=1; i<n; i++) { 3679 v = aa + 16*ai[i]; 3680 vi = aj + ai[i]; 3681 nz = ai[i+1] - ai[i]; 3682 idx = 4*r[i]; 3683 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3684 for(m=0;m<nz;m++){ 3685 idx = 4*vi[m]; 3686 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3687 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3688 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3689 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3690 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3691 v += 16; 3692 } 3693 idx = 4*i; 3694 t[idx] = s1;t[1+idx] = s2; 3695 t[2+idx] = s3;t[3+idx] = s4; 3696 } 3697 /* backward solve the upper triangular */ 3698 for (i=n-1; i>=0; i--){ 3699 v = aa + 16*(adiag[i+1]+1); 3700 vi = aj + adiag[i+1]+1; 3701 nz = adiag[i] - adiag[i+1] - 1; 3702 idt = 4*i; 3703 s1 = t[idt]; s2 = t[1+idt]; 3704 s3 = t[2+idt];s4 = t[3+idt]; 3705 for(m=0;m<nz;m++){ 3706 idx = 4*vi[m]; 3707 x1 = t[idx]; x2 = t[1+idx]; 3708 x3 = t[2+idx]; x4 = t[3+idx]; 3709 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3710 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3711 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3712 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3713 v += 16; 3714 } 3715 idc = 4*c[i]; 3716 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3717 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3718 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3719 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3720 } 3721 3722 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3723 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3724 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3725 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3726 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3727 PetscFunctionReturn(0); 3728 } 3729 3730 #undef __FUNCT__ 3731 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3732 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3733 { 3734 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3735 IS iscol=a->col,isrow=a->row; 3736 PetscErrorCode ierr; 3737 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 3738 PetscInt i,nz,idx,idt,idc; 3739 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3740 const MatScalar *aa=a->a,*v; 3741 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3742 PetscScalar *x; 3743 const PetscScalar *b; 3744 3745 PetscFunctionBegin; 3746 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3747 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3748 t = (MatScalar *)a->solve_work; 3749 3750 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3751 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3752 3753 /* forward solve the lower triangular */ 3754 idx = 4*(*r++); 3755 t[0] = (MatScalar)b[idx]; 3756 t[1] = (MatScalar)b[1+idx]; 3757 t[2] = (MatScalar)b[2+idx]; 3758 t[3] = (MatScalar)b[3+idx]; 3759 for (i=1; i<n; i++) { 3760 v = aa + 16*ai[i]; 3761 vi = aj + ai[i]; 3762 nz = diag[i] - ai[i]; 3763 idx = 4*(*r++); 3764 s1 = (MatScalar)b[idx]; 3765 s2 = (MatScalar)b[1+idx]; 3766 s3 = (MatScalar)b[2+idx]; 3767 s4 = (MatScalar)b[3+idx]; 3768 while (nz--) { 3769 idx = 4*(*vi++); 3770 x1 = t[idx]; 3771 x2 = t[1+idx]; 3772 x3 = t[2+idx]; 3773 x4 = t[3+idx]; 3774 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3775 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3776 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3777 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3778 v += 16; 3779 } 3780 idx = 4*i; 3781 t[idx] = s1; 3782 t[1+idx] = s2; 3783 t[2+idx] = s3; 3784 t[3+idx] = s4; 3785 } 3786 /* backward solve the upper triangular */ 3787 for (i=n-1; i>=0; i--){ 3788 v = aa + 16*diag[i] + 16; 3789 vi = aj + diag[i] + 1; 3790 nz = ai[i+1] - diag[i] - 1; 3791 idt = 4*i; 3792 s1 = t[idt]; 3793 s2 = t[1+idt]; 3794 s3 = t[2+idt]; 3795 s4 = t[3+idt]; 3796 while (nz--) { 3797 idx = 4*(*vi++); 3798 x1 = t[idx]; 3799 x2 = t[1+idx]; 3800 x3 = t[2+idx]; 3801 x4 = t[3+idx]; 3802 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3803 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3804 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3805 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3806 v += 16; 3807 } 3808 idc = 4*(*c--); 3809 v = aa + 16*diag[i]; 3810 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3811 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3812 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3813 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3814 x[idc] = (PetscScalar)t[idt]; 3815 x[1+idc] = (PetscScalar)t[1+idt]; 3816 x[2+idc] = (PetscScalar)t[2+idt]; 3817 x[3+idc] = (PetscScalar)t[3+idt]; 3818 } 3819 3820 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3821 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3822 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3823 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3824 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3825 PetscFunctionReturn(0); 3826 } 3827 3828 #if defined (PETSC_HAVE_SSE) 3829 3830 #include PETSC_HAVE_SSE 3831 3832 #undef __FUNCT__ 3833 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3834 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3835 { 3836 /* 3837 Note: This code uses demotion of double 3838 to float when performing the mixed-mode computation. 3839 This may not be numerically reasonable for all applications. 3840 */ 3841 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3842 IS iscol=a->col,isrow=a->row; 3843 PetscErrorCode ierr; 3844 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3845 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3846 MatScalar *aa=a->a,*v; 3847 PetscScalar *x,*b,*t; 3848 3849 /* Make space in temp stack for 16 Byte Aligned arrays */ 3850 float ssealignedspace[11],*tmps,*tmpx; 3851 unsigned long offset; 3852 3853 PetscFunctionBegin; 3854 SSE_SCOPE_BEGIN; 3855 3856 offset = (unsigned long)ssealignedspace % 16; 3857 if (offset) offset = (16 - offset)/4; 3858 tmps = &ssealignedspace[offset]; 3859 tmpx = &ssealignedspace[offset+4]; 3860 PREFETCH_NTA(aa+16*ai[1]); 3861 3862 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3863 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3864 t = a->solve_work; 3865 3866 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3867 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3868 3869 /* forward solve the lower triangular */ 3870 idx = 4*(*r++); 3871 t[0] = b[idx]; t[1] = b[1+idx]; 3872 t[2] = b[2+idx]; t[3] = b[3+idx]; 3873 v = aa + 16*ai[1]; 3874 3875 for (i=1; i<n;) { 3876 PREFETCH_NTA(&v[8]); 3877 vi = aj + ai[i]; 3878 nz = diag[i] - ai[i]; 3879 idx = 4*(*r++); 3880 3881 /* Demote sum from double to float */ 3882 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3883 LOAD_PS(tmps,XMM7); 3884 3885 while (nz--) { 3886 PREFETCH_NTA(&v[16]); 3887 idx = 4*(*vi++); 3888 3889 /* Demote solution (so far) from double to float */ 3890 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3891 3892 /* 4x4 Matrix-Vector product with negative accumulation: */ 3893 SSE_INLINE_BEGIN_2(tmpx,v) 3894 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3895 3896 /* First Column */ 3897 SSE_COPY_PS(XMM0,XMM6) 3898 SSE_SHUFFLE(XMM0,XMM0,0x00) 3899 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3900 SSE_SUB_PS(XMM7,XMM0) 3901 3902 /* Second Column */ 3903 SSE_COPY_PS(XMM1,XMM6) 3904 SSE_SHUFFLE(XMM1,XMM1,0x55) 3905 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3906 SSE_SUB_PS(XMM7,XMM1) 3907 3908 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3909 3910 /* Third Column */ 3911 SSE_COPY_PS(XMM2,XMM6) 3912 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3913 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3914 SSE_SUB_PS(XMM7,XMM2) 3915 3916 /* Fourth Column */ 3917 SSE_COPY_PS(XMM3,XMM6) 3918 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3919 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3920 SSE_SUB_PS(XMM7,XMM3) 3921 SSE_INLINE_END_2 3922 3923 v += 16; 3924 } 3925 idx = 4*i; 3926 v = aa + 16*ai[++i]; 3927 PREFETCH_NTA(v); 3928 STORE_PS(tmps,XMM7); 3929 3930 /* Promote result from float to double */ 3931 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3932 } 3933 /* backward solve the upper triangular */ 3934 idt = 4*(n-1); 3935 ai16 = 16*diag[n-1]; 3936 v = aa + ai16 + 16; 3937 for (i=n-1; i>=0;){ 3938 PREFETCH_NTA(&v[8]); 3939 vi = aj + diag[i] + 1; 3940 nz = ai[i+1] - diag[i] - 1; 3941 3942 /* Demote accumulator from double to float */ 3943 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3944 LOAD_PS(tmps,XMM7); 3945 3946 while (nz--) { 3947 PREFETCH_NTA(&v[16]); 3948 idx = 4*(*vi++); 3949 3950 /* Demote solution (so far) from double to float */ 3951 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3952 3953 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3954 SSE_INLINE_BEGIN_2(tmpx,v) 3955 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3956 3957 /* First Column */ 3958 SSE_COPY_PS(XMM0,XMM6) 3959 SSE_SHUFFLE(XMM0,XMM0,0x00) 3960 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3961 SSE_SUB_PS(XMM7,XMM0) 3962 3963 /* Second Column */ 3964 SSE_COPY_PS(XMM1,XMM6) 3965 SSE_SHUFFLE(XMM1,XMM1,0x55) 3966 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3967 SSE_SUB_PS(XMM7,XMM1) 3968 3969 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3970 3971 /* Third Column */ 3972 SSE_COPY_PS(XMM2,XMM6) 3973 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3974 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3975 SSE_SUB_PS(XMM7,XMM2) 3976 3977 /* Fourth Column */ 3978 SSE_COPY_PS(XMM3,XMM6) 3979 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3980 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3981 SSE_SUB_PS(XMM7,XMM3) 3982 SSE_INLINE_END_2 3983 v += 16; 3984 } 3985 v = aa + ai16; 3986 ai16 = 16*diag[--i]; 3987 PREFETCH_NTA(aa+ai16+16); 3988 /* 3989 Scale the result by the diagonal 4x4 block, 3990 which was inverted as part of the factorization 3991 */ 3992 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3993 /* First Column */ 3994 SSE_COPY_PS(XMM0,XMM7) 3995 SSE_SHUFFLE(XMM0,XMM0,0x00) 3996 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3997 3998 /* Second Column */ 3999 SSE_COPY_PS(XMM1,XMM7) 4000 SSE_SHUFFLE(XMM1,XMM1,0x55) 4001 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4002 SSE_ADD_PS(XMM0,XMM1) 4003 4004 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4005 4006 /* Third Column */ 4007 SSE_COPY_PS(XMM2,XMM7) 4008 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4009 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4010 SSE_ADD_PS(XMM0,XMM2) 4011 4012 /* Fourth Column */ 4013 SSE_COPY_PS(XMM3,XMM7) 4014 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4015 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4016 SSE_ADD_PS(XMM0,XMM3) 4017 4018 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4019 SSE_INLINE_END_3 4020 4021 /* Promote solution from float to double */ 4022 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 4023 4024 /* Apply reordering to t and stream into x. */ 4025 /* This way, x doesn't pollute the cache. */ 4026 /* Be careful with size: 2 doubles = 4 floats! */ 4027 idc = 4*(*c--); 4028 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 4029 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 4030 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 4031 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 4032 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 4033 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 4034 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 4035 SSE_INLINE_END_2 4036 v = aa + ai16 + 16; 4037 idt -= 4; 4038 } 4039 4040 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4041 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4042 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4043 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4044 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4045 SSE_SCOPE_END; 4046 PetscFunctionReturn(0); 4047 } 4048 4049 #endif 4050 4051 4052 /* 4053 Special case where the matrix was ILU(0) factored in the natural 4054 ordering. This eliminates the need for the column and row permutation. 4055 */ 4056 #undef __FUNCT__ 4057 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace" 4058 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4059 { 4060 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4061 PetscInt n=a->mbs; 4062 const PetscInt *ai=a->i,*aj=a->j; 4063 PetscErrorCode ierr; 4064 const PetscInt *diag = a->diag; 4065 const MatScalar *aa=a->a; 4066 PetscScalar *x; 4067 const PetscScalar *b; 4068 4069 PetscFunctionBegin; 4070 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4071 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4072 4073 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 4074 { 4075 static PetscScalar w[2000]; /* very BAD need to fix */ 4076 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 4077 } 4078 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 4079 { 4080 static PetscScalar w[2000]; /* very BAD need to fix */ 4081 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 4082 } 4083 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 4084 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 4085 #else 4086 { 4087 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4088 const MatScalar *v; 4089 PetscInt jdx,idt,idx,nz,i,ai16; 4090 const PetscInt *vi; 4091 4092 /* forward solve the lower triangular */ 4093 idx = 0; 4094 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 4095 for (i=1; i<n; i++) { 4096 v = aa + 16*ai[i]; 4097 vi = aj + ai[i]; 4098 nz = diag[i] - ai[i]; 4099 idx += 4; 4100 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4101 while (nz--) { 4102 jdx = 4*(*vi++); 4103 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 4104 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4105 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4106 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4107 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4108 v += 16; 4109 } 4110 x[idx] = s1; 4111 x[1+idx] = s2; 4112 x[2+idx] = s3; 4113 x[3+idx] = s4; 4114 } 4115 /* backward solve the upper triangular */ 4116 idt = 4*(n-1); 4117 for (i=n-1; i>=0; i--){ 4118 ai16 = 16*diag[i]; 4119 v = aa + ai16 + 16; 4120 vi = aj + diag[i] + 1; 4121 nz = ai[i+1] - diag[i] - 1; 4122 s1 = x[idt]; s2 = x[1+idt]; 4123 s3 = x[2+idt];s4 = x[3+idt]; 4124 while (nz--) { 4125 idx = 4*(*vi++); 4126 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 4127 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4128 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4129 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4130 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4131 v += 16; 4132 } 4133 v = aa + ai16; 4134 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4135 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 4136 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4137 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4138 idt -= 4; 4139 } 4140 } 4141 #endif 4142 4143 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4144 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4145 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4146 PetscFunctionReturn(0); 4147 } 4148 4149 #undef __FUNCT__ 4150 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 4151 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 4152 { 4153 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4154 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4155 PetscInt i,k,nz,idx,jdx,idt; 4156 PetscErrorCode ierr; 4157 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4158 const MatScalar *aa=a->a,*v; 4159 PetscScalar *x; 4160 const PetscScalar *b; 4161 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 4162 4163 PetscFunctionBegin; 4164 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4166 /* forward solve the lower triangular */ 4167 idx = 0; 4168 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 4169 for (i=1; i<n; i++) { 4170 v = aa + bs2*ai[i]; 4171 vi = aj + ai[i]; 4172 nz = ai[i+1] - ai[i]; 4173 idx = bs*i; 4174 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 4175 for(k=0;k<nz;k++) { 4176 jdx = bs*vi[k]; 4177 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 4178 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4179 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4180 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4181 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4182 4183 v += bs2; 4184 } 4185 4186 x[idx] = s1; 4187 x[1+idx] = s2; 4188 x[2+idx] = s3; 4189 x[3+idx] = s4; 4190 } 4191 4192 /* backward solve the upper triangular */ 4193 for (i=n-1; i>=0; i--){ 4194 v = aa + bs2*(adiag[i+1]+1); 4195 vi = aj + adiag[i+1]+1; 4196 nz = adiag[i] - adiag[i+1]-1; 4197 idt = bs*i; 4198 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 4199 4200 for(k=0;k<nz;k++){ 4201 idx = bs*vi[k]; 4202 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 4203 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4204 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4205 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4206 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4207 4208 v += bs2; 4209 } 4210 /* x = inv_diagonal*x */ 4211 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 4212 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 4213 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 4214 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 4215 4216 } 4217 4218 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4219 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4220 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4221 PetscFunctionReturn(0); 4222 } 4223 4224 #undef __FUNCT__ 4225 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 4226 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 4227 { 4228 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4229 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag; 4230 PetscErrorCode ierr; 4231 const MatScalar *aa=a->a; 4232 const PetscScalar *b; 4233 PetscScalar *x; 4234 4235 PetscFunctionBegin; 4236 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4237 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4238 4239 { 4240 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 4241 const MatScalar *v; 4242 MatScalar *t=(MatScalar *)x; 4243 PetscInt jdx,idt,idx,nz,i,ai16; 4244 const PetscInt *vi; 4245 4246 /* forward solve the lower triangular */ 4247 idx = 0; 4248 t[0] = (MatScalar)b[0]; 4249 t[1] = (MatScalar)b[1]; 4250 t[2] = (MatScalar)b[2]; 4251 t[3] = (MatScalar)b[3]; 4252 for (i=1; i<n; i++) { 4253 v = aa + 16*ai[i]; 4254 vi = aj + ai[i]; 4255 nz = diag[i] - ai[i]; 4256 idx += 4; 4257 s1 = (MatScalar)b[idx]; 4258 s2 = (MatScalar)b[1+idx]; 4259 s3 = (MatScalar)b[2+idx]; 4260 s4 = (MatScalar)b[3+idx]; 4261 while (nz--) { 4262 jdx = 4*(*vi++); 4263 x1 = t[jdx]; 4264 x2 = t[1+jdx]; 4265 x3 = t[2+jdx]; 4266 x4 = t[3+jdx]; 4267 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4268 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4269 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4270 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4271 v += 16; 4272 } 4273 t[idx] = s1; 4274 t[1+idx] = s2; 4275 t[2+idx] = s3; 4276 t[3+idx] = s4; 4277 } 4278 /* backward solve the upper triangular */ 4279 idt = 4*(n-1); 4280 for (i=n-1; i>=0; i--){ 4281 ai16 = 16*diag[i]; 4282 v = aa + ai16 + 16; 4283 vi = aj + diag[i] + 1; 4284 nz = ai[i+1] - diag[i] - 1; 4285 s1 = t[idt]; 4286 s2 = t[1+idt]; 4287 s3 = t[2+idt]; 4288 s4 = t[3+idt]; 4289 while (nz--) { 4290 idx = 4*(*vi++); 4291 x1 = (MatScalar)x[idx]; 4292 x2 = (MatScalar)x[1+idx]; 4293 x3 = (MatScalar)x[2+idx]; 4294 x4 = (MatScalar)x[3+idx]; 4295 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4296 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4297 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4298 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4299 v += 16; 4300 } 4301 v = aa + ai16; 4302 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4303 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4304 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4305 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4306 idt -= 4; 4307 } 4308 } 4309 4310 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4311 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4312 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4313 PetscFunctionReturn(0); 4314 } 4315 4316 #if defined (PETSC_HAVE_SSE) 4317 4318 #include PETSC_HAVE_SSE 4319 #undef __FUNCT__ 4320 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4321 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4322 { 4323 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4324 unsigned short *aj=(unsigned short *)a->j; 4325 PetscErrorCode ierr; 4326 int *ai=a->i,n=a->mbs,*diag = a->diag; 4327 MatScalar *aa=a->a; 4328 PetscScalar *x,*b; 4329 4330 PetscFunctionBegin; 4331 SSE_SCOPE_BEGIN; 4332 /* 4333 Note: This code currently uses demotion of double 4334 to float when performing the mixed-mode computation. 4335 This may not be numerically reasonable for all applications. 4336 */ 4337 PREFETCH_NTA(aa+16*ai[1]); 4338 4339 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4340 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4341 { 4342 /* x will first be computed in single precision then promoted inplace to double */ 4343 MatScalar *v,*t=(MatScalar *)x; 4344 int nz,i,idt,ai16; 4345 unsigned int jdx,idx; 4346 unsigned short *vi; 4347 /* Forward solve the lower triangular factor. */ 4348 4349 /* First block is the identity. */ 4350 idx = 0; 4351 CONVERT_DOUBLE4_FLOAT4(t,b); 4352 v = aa + 16*((unsigned int)ai[1]); 4353 4354 for (i=1; i<n;) { 4355 PREFETCH_NTA(&v[8]); 4356 vi = aj + ai[i]; 4357 nz = diag[i] - ai[i]; 4358 idx += 4; 4359 4360 /* Demote RHS from double to float. */ 4361 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4362 LOAD_PS(&t[idx],XMM7); 4363 4364 while (nz--) { 4365 PREFETCH_NTA(&v[16]); 4366 jdx = 4*((unsigned int)(*vi++)); 4367 4368 /* 4x4 Matrix-Vector product with negative accumulation: */ 4369 SSE_INLINE_BEGIN_2(&t[jdx],v) 4370 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4371 4372 /* First Column */ 4373 SSE_COPY_PS(XMM0,XMM6) 4374 SSE_SHUFFLE(XMM0,XMM0,0x00) 4375 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4376 SSE_SUB_PS(XMM7,XMM0) 4377 4378 /* Second Column */ 4379 SSE_COPY_PS(XMM1,XMM6) 4380 SSE_SHUFFLE(XMM1,XMM1,0x55) 4381 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4382 SSE_SUB_PS(XMM7,XMM1) 4383 4384 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4385 4386 /* Third Column */ 4387 SSE_COPY_PS(XMM2,XMM6) 4388 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4389 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4390 SSE_SUB_PS(XMM7,XMM2) 4391 4392 /* Fourth Column */ 4393 SSE_COPY_PS(XMM3,XMM6) 4394 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4395 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4396 SSE_SUB_PS(XMM7,XMM3) 4397 SSE_INLINE_END_2 4398 4399 v += 16; 4400 } 4401 v = aa + 16*ai[++i]; 4402 PREFETCH_NTA(v); 4403 STORE_PS(&t[idx],XMM7); 4404 } 4405 4406 /* Backward solve the upper triangular factor.*/ 4407 4408 idt = 4*(n-1); 4409 ai16 = 16*diag[n-1]; 4410 v = aa + ai16 + 16; 4411 for (i=n-1; i>=0;){ 4412 PREFETCH_NTA(&v[8]); 4413 vi = aj + diag[i] + 1; 4414 nz = ai[i+1] - diag[i] - 1; 4415 4416 LOAD_PS(&t[idt],XMM7); 4417 4418 while (nz--) { 4419 PREFETCH_NTA(&v[16]); 4420 idx = 4*((unsigned int)(*vi++)); 4421 4422 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4423 SSE_INLINE_BEGIN_2(&t[idx],v) 4424 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4425 4426 /* First Column */ 4427 SSE_COPY_PS(XMM0,XMM6) 4428 SSE_SHUFFLE(XMM0,XMM0,0x00) 4429 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4430 SSE_SUB_PS(XMM7,XMM0) 4431 4432 /* Second Column */ 4433 SSE_COPY_PS(XMM1,XMM6) 4434 SSE_SHUFFLE(XMM1,XMM1,0x55) 4435 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4436 SSE_SUB_PS(XMM7,XMM1) 4437 4438 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4439 4440 /* Third Column */ 4441 SSE_COPY_PS(XMM2,XMM6) 4442 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4443 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4444 SSE_SUB_PS(XMM7,XMM2) 4445 4446 /* Fourth Column */ 4447 SSE_COPY_PS(XMM3,XMM6) 4448 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4449 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4450 SSE_SUB_PS(XMM7,XMM3) 4451 SSE_INLINE_END_2 4452 v += 16; 4453 } 4454 v = aa + ai16; 4455 ai16 = 16*diag[--i]; 4456 PREFETCH_NTA(aa+ai16+16); 4457 /* 4458 Scale the result by the diagonal 4x4 block, 4459 which was inverted as part of the factorization 4460 */ 4461 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4462 /* First Column */ 4463 SSE_COPY_PS(XMM0,XMM7) 4464 SSE_SHUFFLE(XMM0,XMM0,0x00) 4465 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4466 4467 /* Second Column */ 4468 SSE_COPY_PS(XMM1,XMM7) 4469 SSE_SHUFFLE(XMM1,XMM1,0x55) 4470 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4471 SSE_ADD_PS(XMM0,XMM1) 4472 4473 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4474 4475 /* Third Column */ 4476 SSE_COPY_PS(XMM2,XMM7) 4477 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4478 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4479 SSE_ADD_PS(XMM0,XMM2) 4480 4481 /* Fourth Column */ 4482 SSE_COPY_PS(XMM3,XMM7) 4483 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4484 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4485 SSE_ADD_PS(XMM0,XMM3) 4486 4487 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4488 SSE_INLINE_END_3 4489 4490 v = aa + ai16 + 16; 4491 idt -= 4; 4492 } 4493 4494 /* Convert t from single precision back to double precision (inplace)*/ 4495 idt = 4*(n-1); 4496 for (i=n-1;i>=0;i--) { 4497 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4498 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4499 PetscScalar *xtemp=&x[idt]; 4500 MatScalar *ttemp=&t[idt]; 4501 xtemp[3] = (PetscScalar)ttemp[3]; 4502 xtemp[2] = (PetscScalar)ttemp[2]; 4503 xtemp[1] = (PetscScalar)ttemp[1]; 4504 xtemp[0] = (PetscScalar)ttemp[0]; 4505 idt -= 4; 4506 } 4507 4508 } /* End of artificial scope. */ 4509 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4510 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4511 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4512 SSE_SCOPE_END; 4513 PetscFunctionReturn(0); 4514 } 4515 4516 #undef __FUNCT__ 4517 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4518 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4519 { 4520 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4521 int *aj=a->j; 4522 PetscErrorCode ierr; 4523 int *ai=a->i,n=a->mbs,*diag = a->diag; 4524 MatScalar *aa=a->a; 4525 PetscScalar *x,*b; 4526 4527 PetscFunctionBegin; 4528 SSE_SCOPE_BEGIN; 4529 /* 4530 Note: This code currently uses demotion of double 4531 to float when performing the mixed-mode computation. 4532 This may not be numerically reasonable for all applications. 4533 */ 4534 PREFETCH_NTA(aa+16*ai[1]); 4535 4536 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4537 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4538 { 4539 /* x will first be computed in single precision then promoted inplace to double */ 4540 MatScalar *v,*t=(MatScalar *)x; 4541 int nz,i,idt,ai16; 4542 int jdx,idx; 4543 int *vi; 4544 /* Forward solve the lower triangular factor. */ 4545 4546 /* First block is the identity. */ 4547 idx = 0; 4548 CONVERT_DOUBLE4_FLOAT4(t,b); 4549 v = aa + 16*ai[1]; 4550 4551 for (i=1; i<n;) { 4552 PREFETCH_NTA(&v[8]); 4553 vi = aj + ai[i]; 4554 nz = diag[i] - ai[i]; 4555 idx += 4; 4556 4557 /* Demote RHS from double to float. */ 4558 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4559 LOAD_PS(&t[idx],XMM7); 4560 4561 while (nz--) { 4562 PREFETCH_NTA(&v[16]); 4563 jdx = 4*(*vi++); 4564 /* jdx = *vi++; */ 4565 4566 /* 4x4 Matrix-Vector product with negative accumulation: */ 4567 SSE_INLINE_BEGIN_2(&t[jdx],v) 4568 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4569 4570 /* First Column */ 4571 SSE_COPY_PS(XMM0,XMM6) 4572 SSE_SHUFFLE(XMM0,XMM0,0x00) 4573 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4574 SSE_SUB_PS(XMM7,XMM0) 4575 4576 /* Second Column */ 4577 SSE_COPY_PS(XMM1,XMM6) 4578 SSE_SHUFFLE(XMM1,XMM1,0x55) 4579 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4580 SSE_SUB_PS(XMM7,XMM1) 4581 4582 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4583 4584 /* Third Column */ 4585 SSE_COPY_PS(XMM2,XMM6) 4586 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4587 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4588 SSE_SUB_PS(XMM7,XMM2) 4589 4590 /* Fourth Column */ 4591 SSE_COPY_PS(XMM3,XMM6) 4592 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4593 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4594 SSE_SUB_PS(XMM7,XMM3) 4595 SSE_INLINE_END_2 4596 4597 v += 16; 4598 } 4599 v = aa + 16*ai[++i]; 4600 PREFETCH_NTA(v); 4601 STORE_PS(&t[idx],XMM7); 4602 } 4603 4604 /* Backward solve the upper triangular factor.*/ 4605 4606 idt = 4*(n-1); 4607 ai16 = 16*diag[n-1]; 4608 v = aa + ai16 + 16; 4609 for (i=n-1; i>=0;){ 4610 PREFETCH_NTA(&v[8]); 4611 vi = aj + diag[i] + 1; 4612 nz = ai[i+1] - diag[i] - 1; 4613 4614 LOAD_PS(&t[idt],XMM7); 4615 4616 while (nz--) { 4617 PREFETCH_NTA(&v[16]); 4618 idx = 4*(*vi++); 4619 /* idx = *vi++; */ 4620 4621 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4622 SSE_INLINE_BEGIN_2(&t[idx],v) 4623 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4624 4625 /* First Column */ 4626 SSE_COPY_PS(XMM0,XMM6) 4627 SSE_SHUFFLE(XMM0,XMM0,0x00) 4628 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4629 SSE_SUB_PS(XMM7,XMM0) 4630 4631 /* Second Column */ 4632 SSE_COPY_PS(XMM1,XMM6) 4633 SSE_SHUFFLE(XMM1,XMM1,0x55) 4634 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4635 SSE_SUB_PS(XMM7,XMM1) 4636 4637 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4638 4639 /* Third Column */ 4640 SSE_COPY_PS(XMM2,XMM6) 4641 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4642 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4643 SSE_SUB_PS(XMM7,XMM2) 4644 4645 /* Fourth Column */ 4646 SSE_COPY_PS(XMM3,XMM6) 4647 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4648 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4649 SSE_SUB_PS(XMM7,XMM3) 4650 SSE_INLINE_END_2 4651 v += 16; 4652 } 4653 v = aa + ai16; 4654 ai16 = 16*diag[--i]; 4655 PREFETCH_NTA(aa+ai16+16); 4656 /* 4657 Scale the result by the diagonal 4x4 block, 4658 which was inverted as part of the factorization 4659 */ 4660 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4661 /* First Column */ 4662 SSE_COPY_PS(XMM0,XMM7) 4663 SSE_SHUFFLE(XMM0,XMM0,0x00) 4664 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4665 4666 /* Second Column */ 4667 SSE_COPY_PS(XMM1,XMM7) 4668 SSE_SHUFFLE(XMM1,XMM1,0x55) 4669 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4670 SSE_ADD_PS(XMM0,XMM1) 4671 4672 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4673 4674 /* Third Column */ 4675 SSE_COPY_PS(XMM2,XMM7) 4676 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4677 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4678 SSE_ADD_PS(XMM0,XMM2) 4679 4680 /* Fourth Column */ 4681 SSE_COPY_PS(XMM3,XMM7) 4682 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4683 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4684 SSE_ADD_PS(XMM0,XMM3) 4685 4686 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4687 SSE_INLINE_END_3 4688 4689 v = aa + ai16 + 16; 4690 idt -= 4; 4691 } 4692 4693 /* Convert t from single precision back to double precision (inplace)*/ 4694 idt = 4*(n-1); 4695 for (i=n-1;i>=0;i--) { 4696 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4697 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4698 PetscScalar *xtemp=&x[idt]; 4699 MatScalar *ttemp=&t[idt]; 4700 xtemp[3] = (PetscScalar)ttemp[3]; 4701 xtemp[2] = (PetscScalar)ttemp[2]; 4702 xtemp[1] = (PetscScalar)ttemp[1]; 4703 xtemp[0] = (PetscScalar)ttemp[0]; 4704 idt -= 4; 4705 } 4706 4707 } /* End of artificial scope. */ 4708 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4709 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4710 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4711 SSE_SCOPE_END; 4712 PetscFunctionReturn(0); 4713 } 4714 4715 #endif 4716 4717 #undef __FUNCT__ 4718 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace" 4719 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx) 4720 { 4721 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4722 IS iscol=a->col,isrow=a->row; 4723 PetscErrorCode ierr; 4724 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 4725 PetscInt i,nz,idx,idt,idc; 4726 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4727 const MatScalar *aa=a->a,*v; 4728 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4729 const PetscScalar *b; 4730 4731 PetscFunctionBegin; 4732 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4733 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4734 t = a->solve_work; 4735 4736 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4737 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4738 4739 /* forward solve the lower triangular */ 4740 idx = 3*(*r++); 4741 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4742 for (i=1; i<n; i++) { 4743 v = aa + 9*ai[i]; 4744 vi = aj + ai[i]; 4745 nz = diag[i] - ai[i]; 4746 idx = 3*(*r++); 4747 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4748 while (nz--) { 4749 idx = 3*(*vi++); 4750 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4751 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4752 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4753 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4754 v += 9; 4755 } 4756 idx = 3*i; 4757 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4758 } 4759 /* backward solve the upper triangular */ 4760 for (i=n-1; i>=0; i--){ 4761 v = aa + 9*diag[i] + 9; 4762 vi = aj + diag[i] + 1; 4763 nz = ai[i+1] - diag[i] - 1; 4764 idt = 3*i; 4765 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4766 while (nz--) { 4767 idx = 3*(*vi++); 4768 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4769 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4770 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4771 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4772 v += 9; 4773 } 4774 idc = 3*(*c--); 4775 v = aa + 9*diag[i]; 4776 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4777 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4778 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4779 } 4780 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4781 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4782 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4783 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4784 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4785 PetscFunctionReturn(0); 4786 } 4787 4788 #undef __FUNCT__ 4789 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4790 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4791 { 4792 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4793 IS iscol=a->col,isrow=a->row; 4794 PetscErrorCode ierr; 4795 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4796 PetscInt i,nz,idx,idt,idc,m; 4797 const PetscInt *r,*c,*rout,*cout; 4798 const MatScalar *aa=a->a,*v; 4799 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4800 const PetscScalar *b; 4801 4802 PetscFunctionBegin; 4803 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4804 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4805 t = a->solve_work; 4806 4807 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4808 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4809 4810 /* forward solve the lower triangular */ 4811 idx = 3*r[0]; 4812 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4813 for (i=1; i<n; i++) { 4814 v = aa + 9*ai[i]; 4815 vi = aj + ai[i]; 4816 nz = ai[i+1] - ai[i]; 4817 idx = 3*r[i]; 4818 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4819 for(m=0;m<nz;m++){ 4820 idx = 3*vi[m]; 4821 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4822 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4823 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4824 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4825 v += 9; 4826 } 4827 idx = 3*i; 4828 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4829 } 4830 /* backward solve the upper triangular */ 4831 for (i=n-1; i>=0; i--){ 4832 v = aa + 9*(adiag[i+1]+1); 4833 vi = aj + adiag[i+1]+1; 4834 nz = adiag[i] - adiag[i+1] - 1; 4835 idt = 3*i; 4836 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4837 for(m=0;m<nz;m++){ 4838 idx = 3*vi[m]; 4839 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4840 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4841 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4842 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4843 v += 9; 4844 } 4845 idc = 3*c[i]; 4846 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4847 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4848 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4849 } 4850 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4851 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4852 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4853 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4854 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4855 PetscFunctionReturn(0); 4856 } 4857 4858 /* 4859 Special case where the matrix was ILU(0) factored in the natural 4860 ordering. This eliminates the need for the column and row permutation. 4861 */ 4862 #undef __FUNCT__ 4863 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace" 4864 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 4865 { 4866 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4867 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4868 PetscErrorCode ierr; 4869 const PetscInt *diag = a->diag,*vi; 4870 const MatScalar *aa=a->a,*v; 4871 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4872 const PetscScalar *b; 4873 PetscInt jdx,idt,idx,nz,i; 4874 4875 PetscFunctionBegin; 4876 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4877 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4878 4879 /* forward solve the lower triangular */ 4880 idx = 0; 4881 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4882 for (i=1; i<n; i++) { 4883 v = aa + 9*ai[i]; 4884 vi = aj + ai[i]; 4885 nz = diag[i] - ai[i]; 4886 idx += 3; 4887 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4888 while (nz--) { 4889 jdx = 3*(*vi++); 4890 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4891 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4892 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4893 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4894 v += 9; 4895 } 4896 x[idx] = s1; 4897 x[1+idx] = s2; 4898 x[2+idx] = s3; 4899 } 4900 /* backward solve the upper triangular */ 4901 for (i=n-1; i>=0; i--){ 4902 v = aa + 9*diag[i] + 9; 4903 vi = aj + diag[i] + 1; 4904 nz = ai[i+1] - diag[i] - 1; 4905 idt = 3*i; 4906 s1 = x[idt]; s2 = x[1+idt]; 4907 s3 = x[2+idt]; 4908 while (nz--) { 4909 idx = 3*(*vi++); 4910 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4911 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4912 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4913 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4914 v += 9; 4915 } 4916 v = aa + 9*diag[i]; 4917 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4918 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4919 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4920 } 4921 4922 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4923 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4924 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4925 PetscFunctionReturn(0); 4926 } 4927 4928 #undef __FUNCT__ 4929 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4930 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4931 { 4932 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4933 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 4934 PetscErrorCode ierr; 4935 PetscInt i,k,nz,idx,jdx,idt; 4936 const PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4937 const MatScalar *aa=a->a,*v; 4938 PetscScalar *x; 4939 const PetscScalar *b; 4940 PetscScalar s1,s2,s3,x1,x2,x3; 4941 4942 PetscFunctionBegin; 4943 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4944 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4945 /* forward solve the lower triangular */ 4946 idx = 0; 4947 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4948 for (i=1; i<n; i++) { 4949 v = aa + bs2*ai[i]; 4950 vi = aj + ai[i]; 4951 nz = ai[i+1] - ai[i]; 4952 idx = bs*i; 4953 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4954 for(k=0;k<nz;k++){ 4955 jdx = bs*vi[k]; 4956 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4957 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4958 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4959 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4960 4961 v += bs2; 4962 } 4963 4964 x[idx] = s1; 4965 x[1+idx] = s2; 4966 x[2+idx] = s3; 4967 } 4968 4969 /* backward solve the upper triangular */ 4970 for (i=n-1; i>=0; i--){ 4971 v = aa + bs2*(adiag[i+1]+1); 4972 vi = aj + adiag[i+1]+1; 4973 nz = adiag[i] - adiag[i+1]-1; 4974 idt = bs*i; 4975 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4976 4977 for(k=0;k<nz;k++){ 4978 idx = bs*vi[k]; 4979 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4980 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4981 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4982 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4983 4984 v += bs2; 4985 } 4986 /* x = inv_diagonal*x */ 4987 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4988 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4989 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4990 4991 } 4992 4993 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4994 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4995 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4996 PetscFunctionReturn(0); 4997 } 4998 4999 #undef __FUNCT__ 5000 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace" 5001 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx) 5002 { 5003 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5004 IS iscol=a->col,isrow=a->row; 5005 PetscErrorCode ierr; 5006 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5007 PetscInt i,nz,idx,idt,idc; 5008 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5009 const MatScalar *aa=a->a,*v; 5010 PetscScalar *x,s1,s2,x1,x2,*t; 5011 const PetscScalar *b; 5012 5013 PetscFunctionBegin; 5014 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5015 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5016 t = a->solve_work; 5017 5018 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5019 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5020 5021 /* forward solve the lower triangular */ 5022 idx = 2*(*r++); 5023 t[0] = b[idx]; t[1] = b[1+idx]; 5024 for (i=1; i<n; i++) { 5025 v = aa + 4*ai[i]; 5026 vi = aj + ai[i]; 5027 nz = diag[i] - ai[i]; 5028 idx = 2*(*r++); 5029 s1 = b[idx]; s2 = b[1+idx]; 5030 while (nz--) { 5031 idx = 2*(*vi++); 5032 x1 = t[idx]; x2 = t[1+idx]; 5033 s1 -= v[0]*x1 + v[2]*x2; 5034 s2 -= v[1]*x1 + v[3]*x2; 5035 v += 4; 5036 } 5037 idx = 2*i; 5038 t[idx] = s1; t[1+idx] = s2; 5039 } 5040 /* backward solve the upper triangular */ 5041 for (i=n-1; i>=0; i--){ 5042 v = aa + 4*diag[i] + 4; 5043 vi = aj + diag[i] + 1; 5044 nz = ai[i+1] - diag[i] - 1; 5045 idt = 2*i; 5046 s1 = t[idt]; s2 = t[1+idt]; 5047 while (nz--) { 5048 idx = 2*(*vi++); 5049 x1 = t[idx]; x2 = t[1+idx]; 5050 s1 -= v[0]*x1 + v[2]*x2; 5051 s2 -= v[1]*x1 + v[3]*x2; 5052 v += 4; 5053 } 5054 idc = 2*(*c--); 5055 v = aa + 4*diag[i]; 5056 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5057 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5058 } 5059 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5060 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5061 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5062 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5063 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5064 PetscFunctionReturn(0); 5065 } 5066 5067 #undef __FUNCT__ 5068 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 5069 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 5070 { 5071 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5072 IS iscol=a->col,isrow=a->row; 5073 PetscErrorCode ierr; 5074 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5075 PetscInt i,nz,idx,jdx,idt,idc,m; 5076 const PetscInt *r,*c,*rout,*cout; 5077 const MatScalar *aa=a->a,*v; 5078 PetscScalar *x,s1,s2,x1,x2,*t; 5079 const PetscScalar *b; 5080 5081 PetscFunctionBegin; 5082 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5083 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5084 t = a->solve_work; 5085 5086 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5087 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 5088 5089 /* forward solve the lower triangular */ 5090 idx = 2*r[0]; 5091 t[0] = b[idx]; t[1] = b[1+idx]; 5092 for (i=1; i<n; i++) { 5093 v = aa + 4*ai[i]; 5094 vi = aj + ai[i]; 5095 nz = ai[i+1] - ai[i]; 5096 idx = 2*r[i]; 5097 s1 = b[idx]; s2 = b[1+idx]; 5098 for(m=0;m<nz;m++){ 5099 jdx = 2*vi[m]; 5100 x1 = t[jdx]; x2 = t[1+jdx]; 5101 s1 -= v[0]*x1 + v[2]*x2; 5102 s2 -= v[1]*x1 + v[3]*x2; 5103 v += 4; 5104 } 5105 idx = 2*i; 5106 t[idx] = s1; t[1+idx] = s2; 5107 } 5108 /* backward solve the upper triangular */ 5109 for (i=n-1; i>=0; i--){ 5110 v = aa + 4*(adiag[i+1]+1); 5111 vi = aj + adiag[i+1]+1; 5112 nz = adiag[i] - adiag[i+1] - 1; 5113 idt = 2*i; 5114 s1 = t[idt]; s2 = t[1+idt]; 5115 for(m=0;m<nz;m++){ 5116 idx = 2*vi[m]; 5117 x1 = t[idx]; x2 = t[1+idx]; 5118 s1 -= v[0]*x1 + v[2]*x2; 5119 s2 -= v[1]*x1 + v[3]*x2; 5120 v += 4; 5121 } 5122 idc = 2*c[i]; 5123 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 5124 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 5125 } 5126 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5127 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5128 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5129 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5130 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5131 PetscFunctionReturn(0); 5132 } 5133 5134 /* 5135 Special case where the matrix was ILU(0) factored in the natural 5136 ordering. This eliminates the need for the column and row permutation. 5137 */ 5138 #undef __FUNCT__ 5139 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace" 5140 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5141 { 5142 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5143 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5144 PetscErrorCode ierr; 5145 const MatScalar *aa=a->a,*v; 5146 PetscScalar *x,s1,s2,x1,x2; 5147 const PetscScalar *b; 5148 PetscInt jdx,idt,idx,nz,i; 5149 5150 PetscFunctionBegin; 5151 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5152 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5153 5154 /* forward solve the lower triangular */ 5155 idx = 0; 5156 x[0] = b[0]; x[1] = b[1]; 5157 for (i=1; i<n; i++) { 5158 v = aa + 4*ai[i]; 5159 vi = aj + ai[i]; 5160 nz = diag[i] - ai[i]; 5161 idx += 2; 5162 s1 = b[idx];s2 = b[1+idx]; 5163 while (nz--) { 5164 jdx = 2*(*vi++); 5165 x1 = x[jdx];x2 = x[1+jdx]; 5166 s1 -= v[0]*x1 + v[2]*x2; 5167 s2 -= v[1]*x1 + v[3]*x2; 5168 v += 4; 5169 } 5170 x[idx] = s1; 5171 x[1+idx] = s2; 5172 } 5173 /* backward solve the upper triangular */ 5174 for (i=n-1; i>=0; i--){ 5175 v = aa + 4*diag[i] + 4; 5176 vi = aj + diag[i] + 1; 5177 nz = ai[i+1] - diag[i] - 1; 5178 idt = 2*i; 5179 s1 = x[idt]; s2 = x[1+idt]; 5180 while (nz--) { 5181 idx = 2*(*vi++); 5182 x1 = x[idx]; x2 = x[1+idx]; 5183 s1 -= v[0]*x1 + v[2]*x2; 5184 s2 -= v[1]*x1 + v[3]*x2; 5185 v += 4; 5186 } 5187 v = aa + 4*diag[i]; 5188 x[idt] = v[0]*s1 + v[2]*s2; 5189 x[1+idt] = v[1]*s1 + v[3]*s2; 5190 } 5191 5192 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5193 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5194 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5195 PetscFunctionReturn(0); 5196 } 5197 5198 #undef __FUNCT__ 5199 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 5200 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 5201 { 5202 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5203 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag; 5204 PetscInt i,k,nz,idx,idt,jdx; 5205 PetscErrorCode ierr; 5206 const MatScalar *aa=a->a,*v; 5207 PetscScalar *x,s1,s2,x1,x2; 5208 const PetscScalar *b; 5209 5210 PetscFunctionBegin; 5211 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5212 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5213 /* forward solve the lower triangular */ 5214 idx = 0; 5215 x[0] = b[idx]; x[1] = b[1+idx]; 5216 for (i=1; i<n; i++) { 5217 v = aa + 4*ai[i]; 5218 vi = aj + ai[i]; 5219 nz = ai[i+1] - ai[i]; 5220 idx = 2*i; 5221 s1 = b[idx];s2 = b[1+idx]; 5222 for(k=0;k<nz;k++){ 5223 jdx = 2*vi[k]; 5224 x1 = x[jdx];x2 = x[1+jdx]; 5225 s1 -= v[0]*x1 + v[2]*x2; 5226 s2 -= v[1]*x1 + v[3]*x2; 5227 v += 4; 5228 } 5229 x[idx] = s1; 5230 x[1+idx] = s2; 5231 } 5232 5233 /* backward solve the upper triangular */ 5234 for (i=n-1; i>=0; i--){ 5235 v = aa + 4*(adiag[i+1]+1); 5236 vi = aj + adiag[i+1]+1; 5237 nz = adiag[i] - adiag[i+1]-1; 5238 idt = 2*i; 5239 s1 = x[idt]; s2 = x[1+idt]; 5240 for(k=0;k<nz;k++){ 5241 idx = 2*vi[k]; 5242 x1 = x[idx]; x2 = x[1+idx]; 5243 s1 -= v[0]*x1 + v[2]*x2; 5244 s2 -= v[1]*x1 + v[3]*x2; 5245 v += 4; 5246 } 5247 /* x = inv_diagonal*x */ 5248 x[idt] = v[0]*s1 + v[2]*s2; 5249 x[1+idt] = v[1]*s1 + v[3]*s2; 5250 } 5251 5252 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5253 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5254 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 5255 PetscFunctionReturn(0); 5256 } 5257 5258 #undef __FUNCT__ 5259 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace" 5260 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx) 5261 { 5262 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 5263 IS iscol=a->col,isrow=a->row; 5264 PetscErrorCode ierr; 5265 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j; 5266 PetscInt i,nz; 5267 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 5268 const MatScalar *aa=a->a,*v; 5269 PetscScalar *x,s1,*t; 5270 const PetscScalar *b; 5271 5272 PetscFunctionBegin; 5273 if (!n) PetscFunctionReturn(0); 5274 5275 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5276 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5277 t = a->solve_work; 5278 5279 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5280 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5281 5282 /* forward solve the lower triangular */ 5283 t[0] = b[*r++]; 5284 for (i=1; i<n; i++) { 5285 v = aa + ai[i]; 5286 vi = aj + ai[i]; 5287 nz = diag[i] - ai[i]; 5288 s1 = b[*r++]; 5289 while (nz--) { 5290 s1 -= (*v++)*t[*vi++]; 5291 } 5292 t[i] = s1; 5293 } 5294 /* backward solve the upper triangular */ 5295 for (i=n-1; i>=0; i--){ 5296 v = aa + diag[i] + 1; 5297 vi = aj + diag[i] + 1; 5298 nz = ai[i+1] - diag[i] - 1; 5299 s1 = t[i]; 5300 while (nz--) { 5301 s1 -= (*v++)*t[*vi++]; 5302 } 5303 x[*c--] = t[i] = aa[diag[i]]*s1; 5304 } 5305 5306 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5307 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5308 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5309 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5310 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5311 PetscFunctionReturn(0); 5312 } 5313 /* 5314 Special case where the matrix was ILU(0) factored in the natural 5315 ordering. This eliminates the need for the column and row permutation. 5316 */ 5317 #undef __FUNCT__ 5318 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace" 5319 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx) 5320 { 5321 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5322 const PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 5323 PetscErrorCode ierr; 5324 const MatScalar *aa=a->a,*v; 5325 PetscScalar *x; 5326 const PetscScalar *b; 5327 PetscScalar s1,x1; 5328 PetscInt jdx,idt,idx,nz,i; 5329 5330 PetscFunctionBegin; 5331 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5332 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5333 5334 /* forward solve the lower triangular */ 5335 idx = 0; 5336 x[0] = b[0]; 5337 for (i=1; i<n; i++) { 5338 v = aa + ai[i]; 5339 vi = aj + ai[i]; 5340 nz = diag[i] - ai[i]; 5341 idx += 1; 5342 s1 = b[idx]; 5343 while (nz--) { 5344 jdx = *vi++; 5345 x1 = x[jdx]; 5346 s1 -= v[0]*x1; 5347 v += 1; 5348 } 5349 x[idx] = s1; 5350 } 5351 /* backward solve the upper triangular */ 5352 for (i=n-1; i>=0; i--){ 5353 v = aa + diag[i] + 1; 5354 vi = aj + diag[i] + 1; 5355 nz = ai[i+1] - diag[i] - 1; 5356 idt = i; 5357 s1 = x[idt]; 5358 while (nz--) { 5359 idx = *vi++; 5360 x1 = x[idx]; 5361 s1 -= v[0]*x1; 5362 v += 1; 5363 } 5364 v = aa + diag[i]; 5365 x[idt] = v[0]*s1; 5366 } 5367 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 5368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5369 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5370 PetscFunctionReturn(0); 5371 } 5372 5373 /* ----------------------------------------------------------------*/ 5374 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5375 5376 #undef __FUNCT__ 5377 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering" 5378 /* 5379 This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes 5380 */ 5381 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info) 5382 { 5383 Mat C=B; 5384 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5385 PetscErrorCode ierr; 5386 PetscInt i,j,k,ipvt[15]; 5387 const PetscInt n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj; 5388 PetscInt nz,nzL,row; 5389 MatScalar *rtmp,*pc,*mwork,*pv,*vv,work[225]; 5390 const MatScalar *v,*aa=a->a; 5391 PetscInt bs2 = a->bs2,bs=A->rmap->bs,flg; 5392 5393 PetscFunctionBegin; 5394 5395 /* generate work space needed by the factorization */ 5396 ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr); 5397 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5398 5399 for (i=0; i<n; i++){ 5400 /* zero rtmp */ 5401 /* L part */ 5402 nz = bi[i+1] - bi[i]; 5403 bjtmp = bj + bi[i]; 5404 for (j=0; j<nz; j++){ 5405 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5406 } 5407 5408 /* U part */ 5409 nz = bdiag[i] - bdiag[i+1]; 5410 bjtmp = bj + bdiag[i+1]+1; 5411 for (j=0; j<nz; j++){ 5412 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5413 } 5414 5415 /* load in initial (unfactored row) */ 5416 nz = ai[i+1] - ai[i]; 5417 ajtmp = aj + ai[i]; 5418 v = aa + bs2*ai[i]; 5419 for (j=0; j<nz; j++) { 5420 ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5421 } 5422 5423 /* elimination */ 5424 bjtmp = bj + bi[i]; 5425 nzL = bi[i+1] - bi[i]; 5426 for(k=0;k < nzL;k++) { 5427 row = bjtmp[k]; 5428 pc = rtmp + bs2*row; 5429 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5430 if (flg) { 5431 pv = b->a + bs2*bdiag[row]; 5432 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); 5433 /*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/ 5434 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5435 pv = b->a + bs2*(bdiag[row+1]+1); 5436 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5437 for (j=0; j<nz; j++) { 5438 vv = rtmp + bs2*pj[j]; 5439 Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv); 5440 /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */ 5441 pv += bs2; 5442 } 5443 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5444 } 5445 } 5446 5447 /* finished row so stick it into b->a */ 5448 /* L part */ 5449 pv = b->a + bs2*bi[i] ; 5450 pj = b->j + bi[i] ; 5451 nz = bi[i+1] - bi[i]; 5452 for (j=0; j<nz; j++) { 5453 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5454 } 5455 5456 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5457 pv = b->a + bs2*bdiag[i]; 5458 pj = b->j + bdiag[i]; 5459 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5460 /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */ 5461 ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftinblocks);CHKERRQ(ierr); 5462 5463 /* U part */ 5464 pv = b->a + bs2*(bdiag[i+1]+1); 5465 pj = b->j + bdiag[i+1]+1; 5466 nz = bdiag[i] - bdiag[i+1] - 1; 5467 for (j=0; j<nz; j++){ 5468 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5469 } 5470 } 5471 5472 ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr); 5473 C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering; 5474 C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering; 5475 C->assembled = PETSC_TRUE; 5476 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5477 PetscFunctionReturn(0); 5478 } 5479 5480 #undef __FUNCT__ 5481 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N" 5482 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info) 5483 { 5484 Mat C=B; 5485 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5486 IS isrow = b->row,isicol = b->icol; 5487 PetscErrorCode ierr; 5488 const PetscInt *r,*ic,*ics; 5489 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5490 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5491 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5492 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5493 MatScalar *v_work; 5494 PetscTruth col_identity,row_identity,both_identity; 5495 5496 PetscFunctionBegin; 5497 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5498 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5499 5500 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5501 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5502 ics = ic; 5503 5504 /* generate work space needed by dense LU factorization */ 5505 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5506 5507 for (i=0; i<n; i++){ 5508 /* zero rtmp */ 5509 /* L part */ 5510 nz = bi[i+1] - bi[i]; 5511 bjtmp = bj + bi[i]; 5512 for (j=0; j<nz; j++){ 5513 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5514 } 5515 5516 /* U part */ 5517 nz = bdiag[i] - bdiag[i+1]; 5518 bjtmp = bj + bdiag[i+1]+1; 5519 for (j=0; j<nz; j++){ 5520 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5521 } 5522 5523 /* load in initial (unfactored row) */ 5524 nz = ai[r[i]+1] - ai[r[i]]; 5525 ajtmp = aj + ai[r[i]]; 5526 v = aa + bs2*ai[r[i]]; 5527 for (j=0; j<nz; j++) { 5528 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5529 } 5530 5531 /* elimination */ 5532 bjtmp = bj + bi[i]; 5533 nzL = bi[i+1] - bi[i]; 5534 for(k=0;k < nzL;k++) { 5535 row = bjtmp[k]; 5536 pc = rtmp + bs2*row; 5537 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5538 if (flg) { 5539 pv = b->a + bs2*bdiag[row]; 5540 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5541 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5542 pv = b->a + bs2*(bdiag[row+1]+1); 5543 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5544 for (j=0; j<nz; j++) { 5545 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5546 } 5547 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5548 } 5549 } 5550 5551 /* finished row so stick it into b->a */ 5552 /* L part */ 5553 pv = b->a + bs2*bi[i] ; 5554 pj = b->j + bi[i] ; 5555 nz = bi[i+1] - bi[i]; 5556 for (j=0; j<nz; j++) { 5557 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5558 } 5559 5560 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5561 pv = b->a + bs2*bdiag[i]; 5562 pj = b->j + bdiag[i]; 5563 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5564 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5565 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5566 5567 /* U part */ 5568 pv = b->a + bs2*(bdiag[i+1]+1); 5569 pj = b->j + bdiag[i+1]+1; 5570 nz = bdiag[i] - bdiag[i+1] - 1; 5571 for (j=0; j<nz; j++){ 5572 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5573 } 5574 } 5575 5576 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5577 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5578 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5579 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5580 5581 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5582 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5583 both_identity = (PetscTruth) (row_identity && col_identity); 5584 if (both_identity){ 5585 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering; 5586 } else { 5587 C->ops->solve = MatSolve_SeqBAIJ_N; 5588 } 5589 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N; 5590 5591 C->assembled = PETSC_TRUE; 5592 ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5593 PetscFunctionReturn(0); 5594 } 5595 5596 /* 5597 ilu(0) with natural ordering under new data structure. 5598 See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description 5599 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace(). 5600 */ 5601 5602 #undef __FUNCT__ 5603 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0" 5604 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5605 { 5606 5607 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5608 PetscErrorCode ierr; 5609 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5610 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5611 5612 PetscFunctionBegin; 5613 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5614 b = (Mat_SeqBAIJ*)(fact)->data; 5615 5616 /* allocate matrix arrays for new data structure */ 5617 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5618 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5619 b->singlemalloc = PETSC_TRUE; 5620 if (!b->diag){ 5621 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5622 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5623 } 5624 bdiag = b->diag; 5625 5626 if (n > 0) { 5627 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5628 } 5629 5630 /* set bi and bj with new data structure */ 5631 bi = b->i; 5632 bj = b->j; 5633 5634 /* L part */ 5635 bi[0] = 0; 5636 for (i=0; i<n; i++){ 5637 nz = adiag[i] - ai[i]; 5638 bi[i+1] = bi[i] + nz; 5639 aj = a->j + ai[i]; 5640 for (j=0; j<nz; j++){ 5641 *bj = aj[j]; bj++; 5642 } 5643 } 5644 5645 /* U part */ 5646 bi_temp = bi[n]; 5647 bdiag[n] = bi[n]-1; 5648 for (i=n-1; i>=0; i--){ 5649 nz = ai[i+1] - adiag[i] - 1; 5650 bi_temp = bi_temp + nz + 1; 5651 aj = a->j + adiag[i] + 1; 5652 for (j=0; j<nz; j++){ 5653 *bj = aj[j]; bj++; 5654 } 5655 /* diag[i] */ 5656 *bj = i; bj++; 5657 bdiag[i] = bi_temp - 1; 5658 } 5659 PetscFunctionReturn(0); 5660 } 5661 5662 #undef __FUNCT__ 5663 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5664 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5665 { 5666 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5667 IS isicol; 5668 PetscErrorCode ierr; 5669 const PetscInt *r,*ic; 5670 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5671 PetscInt *bi,*cols,nnz,*cols_lvl; 5672 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5673 PetscInt i,levels,diagonal_fill; 5674 PetscTruth col_identity,row_identity,both_identity; 5675 PetscReal f; 5676 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5677 PetscBT lnkbt; 5678 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5679 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5680 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5681 PetscTruth missing; 5682 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5683 5684 PetscFunctionBegin; 5685 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5686 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5687 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5688 5689 f = info->fill; 5690 levels = (PetscInt)info->levels; 5691 diagonal_fill = (PetscInt)info->diagonal_fill; 5692 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5693 5694 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5695 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5696 both_identity = (PetscTruth) (row_identity && col_identity); 5697 5698 if (!levels && both_identity) { 5699 /* special case: ilu(0) with natural ordering */ 5700 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5701 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5702 5703 fact->factor = MAT_FACTOR_ILU; 5704 (fact)->info.factor_mallocs = 0; 5705 (fact)->info.fill_ratio_given = info->fill; 5706 (fact)->info.fill_ratio_needed = 1.0; 5707 b = (Mat_SeqBAIJ*)(fact)->data; 5708 b->row = isrow; 5709 b->col = iscol; 5710 b->icol = isicol; 5711 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5712 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5713 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5714 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5715 PetscFunctionReturn(0); 5716 } 5717 5718 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5719 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5720 5721 /* get new row pointers */ 5722 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5723 bi[0] = 0; 5724 /* bdiag is location of diagonal in factor */ 5725 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5726 bdiag[0] = 0; 5727 5728 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5729 5730 /* create a linked list for storing column indices of the active row */ 5731 nlnk = n + 1; 5732 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5733 5734 /* initial FreeSpace size is f*(ai[n]+1) */ 5735 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5736 current_space = free_space; 5737 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5738 current_space_lvl = free_space_lvl; 5739 5740 for (i=0; i<n; i++) { 5741 nzi = 0; 5742 /* copy current row into linked list */ 5743 nnz = ai[r[i]+1] - ai[r[i]]; 5744 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5745 cols = aj + ai[r[i]]; 5746 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5747 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5748 nzi += nlnk; 5749 5750 /* make sure diagonal entry is included */ 5751 if (diagonal_fill && lnk[i] == -1) { 5752 fm = n; 5753 while (lnk[fm] < i) fm = lnk[fm]; 5754 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5755 lnk[fm] = i; 5756 lnk_lvl[i] = 0; 5757 nzi++; dcount++; 5758 } 5759 5760 /* add pivot rows into the active row */ 5761 nzbd = 0; 5762 prow = lnk[n]; 5763 while (prow < i) { 5764 nnz = bdiag[prow]; 5765 cols = bj_ptr[prow] + nnz + 1; 5766 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5767 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5768 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5769 nzi += nlnk; 5770 prow = lnk[prow]; 5771 nzbd++; 5772 } 5773 bdiag[i] = nzbd; 5774 bi[i+1] = bi[i] + nzi; 5775 5776 /* if free space is not available, make more free space */ 5777 if (current_space->local_remaining<nzi) { 5778 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5779 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5780 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5781 reallocs++; 5782 } 5783 5784 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5785 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5786 bj_ptr[i] = current_space->array; 5787 bjlvl_ptr[i] = current_space_lvl->array; 5788 5789 /* make sure the active row i has diagonal entry */ 5790 if (*(bj_ptr[i]+bdiag[i]) != i) { 5791 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5792 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5793 } 5794 5795 current_space->array += nzi; 5796 current_space->local_used += nzi; 5797 current_space->local_remaining -= nzi; 5798 current_space_lvl->array += nzi; 5799 current_space_lvl->local_used += nzi; 5800 current_space_lvl->local_remaining -= nzi; 5801 } 5802 5803 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5804 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5805 5806 /* destroy list of free space and other temporary arrays */ 5807 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5808 5809 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5810 ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5811 5812 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5813 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5814 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5815 5816 #if defined(PETSC_USE_INFO) 5817 { 5818 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5819 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5820 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5821 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5822 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5823 if (diagonal_fill) { 5824 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5825 } 5826 } 5827 #endif 5828 5829 /* put together the new matrix */ 5830 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5831 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5832 b = (Mat_SeqBAIJ*)(fact)->data; 5833 b->free_a = PETSC_TRUE; 5834 b->free_ij = PETSC_TRUE; 5835 b->singlemalloc = PETSC_FALSE; 5836 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5837 b->j = bj; 5838 b->i = bi; 5839 b->diag = bdiag; 5840 b->free_diag = PETSC_TRUE; 5841 b->ilen = 0; 5842 b->imax = 0; 5843 b->row = isrow; 5844 b->col = iscol; 5845 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5846 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5847 b->icol = isicol; 5848 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5849 /* In b structure: Free imax, ilen, old a, old j. 5850 Allocate bdiag, solve_work, new a, new j */ 5851 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5852 b->maxnz = b->nz = bdiag[0]+1; 5853 fact->info.factor_mallocs = reallocs; 5854 fact->info.fill_ratio_given = f; 5855 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5856 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5857 PetscFunctionReturn(0); 5858 } 5859 5860 5861 /* 5862 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5863 except that the data structure of Mat_SeqAIJ is slightly different. 5864 Not a good example of code reuse. 5865 */ 5866 #undef __FUNCT__ 5867 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace" 5868 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5869 { 5870 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5871 IS isicol; 5872 PetscErrorCode ierr; 5873 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5874 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5875 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5876 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5877 PetscTruth col_identity,row_identity,both_identity,flg; 5878 PetscReal f; 5879 5880 PetscFunctionBegin; 5881 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5882 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5883 5884 f = info->fill; 5885 levels = (PetscInt)info->levels; 5886 diagonal_fill = (PetscInt)info->diagonal_fill; 5887 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5888 5889 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5890 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5891 both_identity = (PetscTruth) (row_identity && col_identity); 5892 5893 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5894 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5895 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 5896 5897 fact->factor = MAT_FACTOR_ILU; 5898 b = (Mat_SeqBAIJ*)fact->data; 5899 b->row = isrow; 5900 b->col = iscol; 5901 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5902 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5903 b->icol = isicol; 5904 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5905 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5906 PetscFunctionReturn(0); 5907 } 5908 5909 /* general case perform the symbolic factorization */ 5910 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5911 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5912 5913 /* get new row pointers */ 5914 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5915 ainew[0] = 0; 5916 /* don't know how many column pointers are needed so estimate */ 5917 jmax = (PetscInt)(f*ai[n] + 1); 5918 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5919 /* ajfill is level of fill for each fill entry */ 5920 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5921 /* fill is a linked list of nonzeros in active row */ 5922 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5923 /* im is level for each filled value */ 5924 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5925 /* dloc is location of diagonal in factor */ 5926 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5927 dloc[0] = 0; 5928 for (prow=0; prow<n; prow++) { 5929 5930 /* copy prow into linked list */ 5931 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5932 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5933 xi = aj + ai[r[prow]]; 5934 fill[n] = n; 5935 fill[prow] = -1; /* marker for diagonal entry */ 5936 while (nz--) { 5937 fm = n; 5938 idx = ic[*xi++]; 5939 do { 5940 m = fm; 5941 fm = fill[m]; 5942 } while (fm < idx); 5943 fill[m] = idx; 5944 fill[idx] = fm; 5945 im[idx] = 0; 5946 } 5947 5948 /* make sure diagonal entry is included */ 5949 if (diagonal_fill && fill[prow] == -1) { 5950 fm = n; 5951 while (fill[fm] < prow) fm = fill[fm]; 5952 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5953 fill[fm] = prow; 5954 im[prow] = 0; 5955 nzf++; 5956 dcount++; 5957 } 5958 5959 nzi = 0; 5960 row = fill[n]; 5961 while (row < prow) { 5962 incrlev = im[row] + 1; 5963 nz = dloc[row]; 5964 xi = ajnew + ainew[row] + nz + 1; 5965 flev = ajfill + ainew[row] + nz + 1; 5966 nnz = ainew[row+1] - ainew[row] - nz - 1; 5967 fm = row; 5968 while (nnz-- > 0) { 5969 idx = *xi++; 5970 if (*flev + incrlev > levels) { 5971 flev++; 5972 continue; 5973 } 5974 do { 5975 m = fm; 5976 fm = fill[m]; 5977 } while (fm < idx); 5978 if (fm != idx) { 5979 im[idx] = *flev + incrlev; 5980 fill[m] = idx; 5981 fill[idx] = fm; 5982 fm = idx; 5983 nzf++; 5984 } else { 5985 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5986 } 5987 flev++; 5988 } 5989 row = fill[row]; 5990 nzi++; 5991 } 5992 /* copy new filled row into permanent storage */ 5993 ainew[prow+1] = ainew[prow] + nzf; 5994 if (ainew[prow+1] > jmax) { 5995 5996 /* estimate how much additional space we will need */ 5997 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5998 /* just double the memory each time */ 5999 PetscInt maxadd = jmax; 6000 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 6001 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 6002 jmax += maxadd; 6003 6004 /* allocate a longer ajnew and ajfill */ 6005 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6006 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6007 ierr = PetscFree(ajnew);CHKERRQ(ierr); 6008 ajnew = xitmp; 6009 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 6010 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 6011 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6012 ajfill = xitmp; 6013 reallocate++; /* count how many reallocations are needed */ 6014 } 6015 xitmp = ajnew + ainew[prow]; 6016 flev = ajfill + ainew[prow]; 6017 dloc[prow] = nzi; 6018 fm = fill[n]; 6019 while (nzf--) { 6020 *xitmp++ = fm; 6021 *flev++ = im[fm]; 6022 fm = fill[fm]; 6023 } 6024 /* make sure row has diagonal entry */ 6025 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 6026 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 6027 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 6028 } 6029 } 6030 ierr = PetscFree(ajfill);CHKERRQ(ierr); 6031 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 6032 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 6033 ierr = PetscFree(fill);CHKERRQ(ierr); 6034 ierr = PetscFree(im);CHKERRQ(ierr); 6035 6036 #if defined(PETSC_USE_INFO) 6037 { 6038 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 6039 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 6040 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 6041 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 6042 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 6043 if (diagonal_fill) { 6044 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 6045 } 6046 } 6047 #endif 6048 6049 /* put together the new matrix */ 6050 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 6051 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 6052 b = (Mat_SeqBAIJ*)fact->data; 6053 b->free_a = PETSC_TRUE; 6054 b->free_ij = PETSC_TRUE; 6055 b->singlemalloc = PETSC_FALSE; 6056 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 6057 b->j = ajnew; 6058 b->i = ainew; 6059 for (i=0; i<n; i++) dloc[i] += ainew[i]; 6060 b->diag = dloc; 6061 b->free_diag = PETSC_TRUE; 6062 b->ilen = 0; 6063 b->imax = 0; 6064 b->row = isrow; 6065 b->col = iscol; 6066 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 6067 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 6068 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 6069 b->icol = isicol; 6070 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 6071 /* In b structure: Free imax, ilen, old a, old j. 6072 Allocate dloc, solve_work, new a, new j */ 6073 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 6074 b->maxnz = b->nz = ainew[n]; 6075 6076 fact->info.factor_mallocs = reallocate; 6077 fact->info.fill_ratio_given = f; 6078 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 6079 6080 ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr); 6081 PetscFunctionReturn(0); 6082 } 6083 6084 #undef __FUNCT__ 6085 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 6086 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 6087 { 6088 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 6089 /* int i,*AJ=a->j,nz=a->nz; */ 6090 PetscFunctionBegin; 6091 /* Undo Column scaling */ 6092 /* while (nz--) { */ 6093 /* AJ[i] = AJ[i]/4; */ 6094 /* } */ 6095 /* This should really invoke a push/pop logic, but we don't have that yet. */ 6096 A->ops->setunfactored = PETSC_NULL; 6097 PetscFunctionReturn(0); 6098 } 6099 6100 #undef __FUNCT__ 6101 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 6102 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 6103 { 6104 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 6105 PetscInt *AJ=a->j,nz=a->nz; 6106 unsigned short *aj=(unsigned short *)AJ; 6107 PetscFunctionBegin; 6108 /* Is this really necessary? */ 6109 while (nz--) { 6110 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 6111 } 6112 A->ops->setunfactored = PETSC_NULL; 6113 PetscFunctionReturn(0); 6114 } 6115 6116 6117