1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124 PetscInt nz,idx,idt,j,i,oidx; 125 PetscInt bs=A->rmap->bs,bs2=a->bs2; 126 MatScalar *aa=a->a,*v; 127 PetscScalar s1,s2,x1,x2; 128 PetscScalar *x,*b; 129 130 PetscFunctionBegin; 131 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134 135 /* forward solve the U^T */ 136 idx = 0; 137 for (i=0; i<n; i++) { 138 v = aa + bs2*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; 141 s1 = v[0]*x1 + v[1]*x2; 142 s2 = v[2]*x1 + v[3]*x2; 143 v -= bs2; 144 145 vi = aj + diag[i] - 1; 146 nz = diag[i] - diag[i+1] - 1; 147 for(j=0;j>-nz;j--){ 148 oidx = bs*vi[j]; 149 x[oidx] -= v[0]*s1 + v[1]*s2; 150 x[oidx+1] -= v[2]*s1 + v[3]*s2; 151 v -= bs2; 152 } 153 x[idx] = s1;x[1+idx] = s2; 154 idx += bs; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + bs2*ai[i]; 159 vi = aj + ai[i]; 160 nz = ai[i+1] - ai[i]; 161 idt = bs*i; 162 s1 = x[idt]; s2 = x[1+idt]; 163 for(j=0;j<nz;j++){ 164 idx = bs*vi[j]; 165 x[idx] -= v[0]*s1 + v[1]*s2; 166 x[idx+1] -= v[2]*s1 + v[3]*s2; 167 v += bs2; 168 } 169 } 170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183 PetscInt *diag = a->diag,oidx; 184 MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,x1,x2,x3; 186 PetscScalar *x,*b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 9*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203 v += 9; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 3*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212 v += 9; 213 } 214 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215 idx += 3; 216 } 217 /* backward solve the L^T */ 218 for (i=n-1; i>=0; i--){ 219 v = aa + 9*diag[i] - 9; 220 vi = aj + diag[i] - 1; 221 nz = diag[i] - ai[i]; 222 idt = 3*i; 223 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224 while (nz--) { 225 idx = 3*(*vi--); 226 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229 v -= 9; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct" 240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 PetscErrorCode ierr; 244 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 245 PetscInt nz,idx,idt,j,i,oidx; 246 PetscInt bs=A->rmap->bs,bs2=a->bs2; 247 MatScalar *aa=a->a,*v; 248 PetscScalar s1,s2,s3,x1,x2,x3; 249 PetscScalar *x,*b; 250 251 PetscFunctionBegin; 252 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255 256 /* forward solve the U^T */ 257 idx = 0; 258 for (i=0; i<n; i++) { 259 v = aa + bs2*diag[i]; 260 /* multiply by the inverse of the block diagonal */ 261 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 262 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 263 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 264 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 265 v -= bs2; 266 267 vi = aj + diag[i] - 1; 268 nz = diag[i] - diag[i+1] - 1; 269 for(j=0;j>-nz;j--){ 270 oidx = bs*vi[j]; 271 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 272 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 273 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 274 v -= bs2; 275 } 276 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 277 idx += bs; 278 } 279 /* backward solve the L^T */ 280 for (i=n-1; i>=0; i--){ 281 v = aa + bs2*ai[i]; 282 vi = aj + ai[i]; 283 nz = ai[i+1] - ai[i]; 284 idt = bs*i; 285 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 286 for(j=0;j<nz;j++){ 287 idx = bs*vi[j]; 288 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 289 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 290 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 291 v += bs2; 292 } 293 } 294 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 PetscErrorCode ierr; 306 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 PetscInt *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 310 PetscScalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316 317 /* forward solve the U^T */ 318 idx = 0; 319 for (i=0; i<n; i++) { 320 321 v = aa + 16*diag[i]; 322 /* multiply by the inverse of the block diagonal */ 323 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328 v += 16; 329 330 vi = aj + diag[i] + 1; 331 nz = ai[i+1] - diag[i] - 1; 332 while (nz--) { 333 oidx = 4*(*vi++); 334 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338 v += 16; 339 } 340 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341 idx += 4; 342 } 343 /* backward solve the L^T */ 344 for (i=n-1; i>=0; i--){ 345 v = aa + 16*diag[i] - 16; 346 vi = aj + diag[i] - 1; 347 nz = diag[i] - ai[i]; 348 idt = 4*i; 349 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350 while (nz--) { 351 idx = 4*(*vi--); 352 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356 v -= 16; 357 } 358 } 359 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct" 367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 370 PetscErrorCode ierr; 371 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 372 PetscInt nz,idx,idt,j,i,oidx; 373 PetscInt bs=A->rmap->bs,bs2=a->bs2; 374 MatScalar *aa=a->a,*v; 375 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 376 PetscScalar *x,*b; 377 378 PetscFunctionBegin; 379 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 383 /* forward solve the U^T */ 384 idx = 0; 385 for (i=0; i<n; i++) { 386 v = aa + bs2*diag[i]; 387 /* multiply by the inverse of the block diagonal */ 388 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 389 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393 v -= bs2; 394 395 vi = aj + diag[i] - 1; 396 nz = diag[i] - diag[i+1] - 1; 397 for(j=0;j>-nz;j--){ 398 oidx = bs*vi[j]; 399 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 400 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 401 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 402 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 403 v -= bs2; 404 } 405 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 406 idx += bs; 407 } 408 /* backward solve the L^T */ 409 for (i=n-1; i>=0; i--){ 410 v = aa + bs2*ai[i]; 411 vi = aj + ai[i]; 412 nz = ai[i+1] - ai[i]; 413 idt = bs*i; 414 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 415 for(j=0;j<nz;j++){ 416 idx = bs*vi[j]; 417 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 418 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 419 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 420 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 421 v += bs2; 422 } 423 } 424 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 425 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 426 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 #undef __FUNCT__ 431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 433 { 434 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435 PetscErrorCode ierr; 436 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437 PetscInt *diag = a->diag,oidx; 438 MatScalar *aa=a->a,*v; 439 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 440 PetscScalar *x,*b; 441 442 PetscFunctionBegin; 443 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 444 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446 447 /* forward solve the U^T */ 448 idx = 0; 449 for (i=0; i<n; i++) { 450 451 v = aa + 25*diag[i]; 452 /* multiply by the inverse of the block diagonal */ 453 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459 v += 25; 460 461 vi = aj + diag[i] + 1; 462 nz = ai[i+1] - diag[i] - 1; 463 while (nz--) { 464 oidx = 5*(*vi++); 465 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470 v += 25; 471 } 472 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473 idx += 5; 474 } 475 /* backward solve the L^T */ 476 for (i=n-1; i>=0; i--){ 477 v = aa + 25*diag[i] - 25; 478 vi = aj + diag[i] - 1; 479 nz = diag[i] - ai[i]; 480 idt = 5*i; 481 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482 while (nz--) { 483 idx = 5*(*vi--); 484 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489 v -= 25; 490 } 491 } 492 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 493 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #undef __FUNCT__ 499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct" 500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 501 { 502 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 503 PetscErrorCode ierr; 504 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 505 PetscInt nz,idx,idt,j,i,oidx; 506 PetscInt bs=A->rmap->bs,bs2=a->bs2; 507 MatScalar *aa=a->a,*v; 508 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 509 PetscScalar *x,*b; 510 511 PetscFunctionBegin; 512 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 514 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515 516 /* forward solve the U^T */ 517 idx = 0; 518 for (i=0; i<n; i++) { 519 v = aa + bs2*diag[i]; 520 /* multiply by the inverse of the block diagonal */ 521 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 522 x5 = x[4+idx]; 523 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 524 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 525 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 526 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 527 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 528 v -= bs2; 529 530 vi = aj + diag[i] - 1; 531 nz = diag[i] - diag[i+1] - 1; 532 for(j=0;j>-nz;j--){ 533 oidx = bs*vi[j]; 534 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 535 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 536 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 537 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 538 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 539 v -= bs2; 540 } 541 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 542 idx += bs; 543 } 544 /* backward solve the L^T */ 545 for (i=n-1; i>=0; i--){ 546 v = aa + bs2*ai[i]; 547 vi = aj + ai[i]; 548 nz = ai[i+1] - ai[i]; 549 idt = bs*i; 550 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 551 for(j=0;j<nz;j++){ 552 idx = bs*vi[j]; 553 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 554 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 555 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 556 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 557 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 558 v += bs2; 559 } 560 } 561 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 562 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 563 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 564 PetscFunctionReturn(0); 565 } 566 567 #undef __FUNCT__ 568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 570 { 571 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572 PetscErrorCode ierr; 573 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574 PetscInt *diag = a->diag,oidx; 575 MatScalar *aa=a->a,*v; 576 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 577 PetscScalar *x,*b; 578 579 PetscFunctionBegin; 580 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583 584 /* forward solve the U^T */ 585 idx = 0; 586 for (i=0; i<n; i++) { 587 588 v = aa + 36*diag[i]; 589 /* multiply by the inverse of the block diagonal */ 590 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591 x6 = x[5+idx]; 592 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598 v += 36; 599 600 vi = aj + diag[i] + 1; 601 nz = ai[i+1] - diag[i] - 1; 602 while (nz--) { 603 oidx = 6*(*vi++); 604 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610 v += 36; 611 } 612 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613 x[5+idx] = s6; 614 idx += 6; 615 } 616 /* backward solve the L^T */ 617 for (i=n-1; i>=0; i--){ 618 v = aa + 36*diag[i] - 36; 619 vi = aj + diag[i] - 1; 620 nz = diag[i] - ai[i]; 621 idt = 6*i; 622 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623 s6 = x[5+idt]; 624 while (nz--) { 625 idx = 6*(*vi--); 626 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v -= 36; 633 } 634 } 635 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 636 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 #undef __FUNCT__ 642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct" 643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 644 { 645 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 646 PetscErrorCode ierr; 647 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 648 PetscInt nz,idx,idt,j,i,oidx; 649 PetscInt bs=A->rmap->bs,bs2=a->bs2; 650 MatScalar *aa=a->a,*v; 651 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 652 PetscScalar *x,*b; 653 654 PetscFunctionBegin; 655 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 657 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658 659 /* forward solve the U^T */ 660 idx = 0; 661 for (i=0; i<n; i++) { 662 v = aa + bs2*diag[i]; 663 /* multiply by the inverse of the block diagonal */ 664 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 665 x5 = x[4+idx]; x6 = x[5+idx]; 666 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 667 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 668 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 669 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 670 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 671 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 672 v -= bs2; 673 674 vi = aj + diag[i] - 1; 675 nz = diag[i] - diag[i+1] - 1; 676 for(j=0;j>-nz;j--){ 677 oidx = bs*vi[j]; 678 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684 v -= bs2; 685 } 686 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 687 x[5+idx] = s6; 688 idx += bs; 689 } 690 /* backward solve the L^T */ 691 for (i=n-1; i>=0; i--){ 692 v = aa + bs2*ai[i]; 693 vi = aj + ai[i]; 694 nz = ai[i+1] - ai[i]; 695 idt = bs*i; 696 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 697 s6 = x[5+idt]; 698 for(j=0;j<nz;j++){ 699 idx = bs*vi[j]; 700 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 701 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 702 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 703 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 704 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 705 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 706 v += bs2; 707 } 708 } 709 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 711 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 718 { 719 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720 PetscErrorCode ierr; 721 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722 PetscInt *diag = a->diag,oidx; 723 MatScalar *aa=a->a,*v; 724 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 725 PetscScalar *x,*b; 726 727 PetscFunctionBegin; 728 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 729 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731 732 /* forward solve the U^T */ 733 idx = 0; 734 for (i=0; i<n; i++) { 735 736 v = aa + 49*diag[i]; 737 /* multiply by the inverse of the block diagonal */ 738 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739 x6 = x[5+idx]; x7 = x[6+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747 v += 49; 748 749 vi = aj + diag[i] + 1; 750 nz = ai[i+1] - diag[i] - 1; 751 while (nz--) { 752 oidx = 7*(*vi++); 753 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760 v += 49; 761 } 762 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763 x[5+idx] = s6;x[6+idx] = s7; 764 idx += 7; 765 } 766 /* backward solve the L^T */ 767 for (i=n-1; i>=0; i--){ 768 v = aa + 49*diag[i] - 49; 769 vi = aj + diag[i] - 1; 770 nz = diag[i] - ai[i]; 771 idt = 7*i; 772 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773 s6 = x[5+idt];s7 = x[6+idt]; 774 while (nz--) { 775 idx = 7*(*vi--); 776 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783 v -= 49; 784 } 785 } 786 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 787 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 PetscErrorCode ierr; 797 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 798 PetscInt nz,idx,idt,j,i,oidx; 799 PetscInt bs=A->rmap->bs,bs2=a->bs2; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 802 PetscScalar *x,*b; 803 804 PetscFunctionBegin; 805 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 807 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 808 809 /* forward solve the U^T */ 810 idx = 0; 811 for (i=0; i<n; i++) { 812 v = aa + bs2*diag[i]; 813 /* multiply by the inverse of the block diagonal */ 814 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 815 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 816 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 817 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 818 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 819 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 820 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 821 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 822 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 823 v -= bs2; 824 vi = aj + diag[i] - 1; 825 nz = diag[i] - diag[i+1] - 1; 826 for(j=0;j>-nz;j--){ 827 oidx = bs*vi[j]; 828 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835 v -= bs2; 836 } 837 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 838 x[5+idx] = s6; x[6+idx] = s7; 839 idx += bs; 840 } 841 /* backward solve the L^T */ 842 for (i=n-1; i>=0; i--){ 843 v = aa + bs2*ai[i]; 844 vi = aj + ai[i]; 845 nz = ai[i+1] - ai[i]; 846 idt = bs*i; 847 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 848 s6 = x[5+idt]; s7 = x[6+idt]; 849 for(j=0;j<nz;j++){ 850 idx = bs*vi[j]; 851 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 852 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 853 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 854 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 855 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 856 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 857 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 858 v += bs2; 859 } 860 } 861 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 862 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 863 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 867 /*---------------------------------------------------------------------------------------------*/ 868 #undef __FUNCT__ 869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 871 { 872 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873 IS iscol=a->col,isrow=a->row; 874 PetscErrorCode ierr; 875 const PetscInt *r,*c,*rout,*cout; 876 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877 PetscInt *diag = a->diag; 878 MatScalar *aa=a->a,*v; 879 PetscScalar s1,*x,*b,*t; 880 881 PetscFunctionBegin; 882 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 883 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884 t = a->solve_work; 885 886 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888 889 /* copy the b into temp work space according to permutation */ 890 for (i=0; i<n; i++) { 891 t[i] = b[c[i]]; 892 } 893 894 /* forward solve the U^T */ 895 for (i=0; i<n; i++) { 896 897 v = aa + diag[i]; 898 /* multiply by the inverse of the block diagonal */ 899 s1 = (*v++)*t[i]; 900 vi = aj + diag[i] + 1; 901 nz = ai[i+1] - diag[i] - 1; 902 while (nz--) { 903 t[*vi++] -= (*v++)*s1; 904 } 905 t[i] = s1; 906 } 907 /* backward solve the L^T */ 908 for (i=n-1; i>=0; i--){ 909 v = aa + diag[i] - 1; 910 vi = aj + diag[i] - 1; 911 nz = diag[i] - ai[i]; 912 s1 = t[i]; 913 while (nz--) { 914 t[*vi--] -= (*v--)*s1; 915 } 916 } 917 918 /* copy t into x according to permutation */ 919 for (i=0; i<n; i++) { 920 x[r[i]] = t[i]; 921 } 922 923 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 925 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 926 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 #undef __FUNCT__ 932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 934 { 935 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936 IS iscol=a->col,isrow=a->row; 937 PetscErrorCode ierr; 938 const PetscInt *r,*c,*rout,*cout; 939 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940 PetscInt *diag = a->diag,ii,ic,ir,oidx; 941 MatScalar *aa=a->a,*v; 942 PetscScalar s1,s2,x1,x2; 943 PetscScalar *x,*b,*t; 944 945 PetscFunctionBegin; 946 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 947 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948 t = a->solve_work; 949 950 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952 953 /* copy the b into temp work space according to permutation */ 954 ii = 0; 955 for (i=0; i<n; i++) { 956 ic = 2*c[i]; 957 t[ii] = b[ic]; 958 t[ii+1] = b[ic+1]; 959 ii += 2; 960 } 961 962 /* forward solve the U^T */ 963 idx = 0; 964 for (i=0; i<n; i++) { 965 966 v = aa + 4*diag[i]; 967 /* multiply by the inverse of the block diagonal */ 968 x1 = t[idx]; x2 = t[1+idx]; 969 s1 = v[0]*x1 + v[1]*x2; 970 s2 = v[2]*x1 + v[3]*x2; 971 v += 4; 972 973 vi = aj + diag[i] + 1; 974 nz = ai[i+1] - diag[i] - 1; 975 while (nz--) { 976 oidx = 2*(*vi++); 977 t[oidx] -= v[0]*s1 + v[1]*s2; 978 t[oidx+1] -= v[2]*s1 + v[3]*s2; 979 v += 4; 980 } 981 t[idx] = s1;t[1+idx] = s2; 982 idx += 2; 983 } 984 /* backward solve the L^T */ 985 for (i=n-1; i>=0; i--){ 986 v = aa + 4*diag[i] - 4; 987 vi = aj + diag[i] - 1; 988 nz = diag[i] - ai[i]; 989 idt = 2*i; 990 s1 = t[idt]; s2 = t[1+idt]; 991 while (nz--) { 992 idx = 2*(*vi--); 993 t[idx] -= v[0]*s1 + v[1]*s2; 994 t[idx+1] -= v[2]*s1 + v[3]*s2; 995 v -= 4; 996 } 997 } 998 999 /* copy t into x according to permutation */ 1000 ii = 0; 1001 for (i=0; i<n; i++) { 1002 ir = 2*r[i]; 1003 x[ir] = t[ii]; 1004 x[ir+1] = t[ii+1]; 1005 ii += 2; 1006 } 1007 1008 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1010 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1011 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013 PetscFunctionReturn(0); 1014 } 1015 1016 #undef __FUNCT__ 1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1019 { 1020 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1021 IS iscol=a->col,isrow=a->row; 1022 PetscErrorCode ierr; 1023 const PetscInt *r,*c,*rout,*cout; 1024 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1025 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1026 MatScalar *aa=a->a,*v; 1027 PetscScalar s1,s2,s3,x1,x2,x3; 1028 PetscScalar *x,*b,*t; 1029 1030 PetscFunctionBegin; 1031 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1032 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1033 t = a->solve_work; 1034 1035 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1036 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1037 1038 /* copy the b into temp work space according to permutation */ 1039 ii = 0; 1040 for (i=0; i<n; i++) { 1041 ic = 3*c[i]; 1042 t[ii] = b[ic]; 1043 t[ii+1] = b[ic+1]; 1044 t[ii+2] = b[ic+2]; 1045 ii += 3; 1046 } 1047 1048 /* forward solve the U^T */ 1049 idx = 0; 1050 for (i=0; i<n; i++) { 1051 1052 v = aa + 9*diag[i]; 1053 /* multiply by the inverse of the block diagonal */ 1054 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1055 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1056 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1057 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1058 v += 9; 1059 1060 vi = aj + diag[i] + 1; 1061 nz = ai[i+1] - diag[i] - 1; 1062 while (nz--) { 1063 oidx = 3*(*vi++); 1064 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1065 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1066 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1067 v += 9; 1068 } 1069 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1070 idx += 3; 1071 } 1072 /* backward solve the L^T */ 1073 for (i=n-1; i>=0; i--){ 1074 v = aa + 9*diag[i] - 9; 1075 vi = aj + diag[i] - 1; 1076 nz = diag[i] - ai[i]; 1077 idt = 3*i; 1078 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1079 while (nz--) { 1080 idx = 3*(*vi--); 1081 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1082 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1083 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1084 v -= 9; 1085 } 1086 } 1087 1088 /* copy t into x according to permutation */ 1089 ii = 0; 1090 for (i=0; i<n; i++) { 1091 ir = 3*r[i]; 1092 x[ir] = t[ii]; 1093 x[ir+1] = t[ii+1]; 1094 x[ir+2] = t[ii+2]; 1095 ii += 3; 1096 } 1097 1098 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1099 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1100 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1101 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1102 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1103 PetscFunctionReturn(0); 1104 } 1105 1106 #undef __FUNCT__ 1107 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1108 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1109 { 1110 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1111 IS iscol=a->col,isrow=a->row; 1112 PetscErrorCode ierr; 1113 const PetscInt *r,*c,*rout,*cout; 1114 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1115 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1116 MatScalar *aa=a->a,*v; 1117 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1118 PetscScalar *x,*b,*t; 1119 1120 PetscFunctionBegin; 1121 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1122 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1123 t = a->solve_work; 1124 1125 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1126 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1127 1128 /* copy the b into temp work space according to permutation */ 1129 ii = 0; 1130 for (i=0; i<n; i++) { 1131 ic = 4*c[i]; 1132 t[ii] = b[ic]; 1133 t[ii+1] = b[ic+1]; 1134 t[ii+2] = b[ic+2]; 1135 t[ii+3] = b[ic+3]; 1136 ii += 4; 1137 } 1138 1139 /* forward solve the U^T */ 1140 idx = 0; 1141 for (i=0; i<n; i++) { 1142 1143 v = aa + 16*diag[i]; 1144 /* multiply by the inverse of the block diagonal */ 1145 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1146 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1147 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1148 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1149 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1150 v += 16; 1151 1152 vi = aj + diag[i] + 1; 1153 nz = ai[i+1] - diag[i] - 1; 1154 while (nz--) { 1155 oidx = 4*(*vi++); 1156 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1157 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1158 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1159 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1160 v += 16; 1161 } 1162 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1163 idx += 4; 1164 } 1165 /* backward solve the L^T */ 1166 for (i=n-1; i>=0; i--){ 1167 v = aa + 16*diag[i] - 16; 1168 vi = aj + diag[i] - 1; 1169 nz = diag[i] - ai[i]; 1170 idt = 4*i; 1171 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1172 while (nz--) { 1173 idx = 4*(*vi--); 1174 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1175 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1176 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1177 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1178 v -= 16; 1179 } 1180 } 1181 1182 /* copy t into x according to permutation */ 1183 ii = 0; 1184 for (i=0; i<n; i++) { 1185 ir = 4*r[i]; 1186 x[ir] = t[ii]; 1187 x[ir+1] = t[ii+1]; 1188 x[ir+2] = t[ii+2]; 1189 x[ir+3] = t[ii+3]; 1190 ii += 4; 1191 } 1192 1193 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1194 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1195 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1196 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1197 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1198 PetscFunctionReturn(0); 1199 } 1200 1201 #undef __FUNCT__ 1202 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1203 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1204 { 1205 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1206 IS iscol=a->col,isrow=a->row; 1207 PetscErrorCode ierr; 1208 const PetscInt *r,*c,*rout,*cout; 1209 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1210 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1211 MatScalar *aa=a->a,*v; 1212 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1213 PetscScalar *x,*b,*t; 1214 1215 PetscFunctionBegin; 1216 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1217 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1218 t = a->solve_work; 1219 1220 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1221 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1222 1223 /* copy the b into temp work space according to permutation */ 1224 ii = 0; 1225 for (i=0; i<n; i++) { 1226 ic = 5*c[i]; 1227 t[ii] = b[ic]; 1228 t[ii+1] = b[ic+1]; 1229 t[ii+2] = b[ic+2]; 1230 t[ii+3] = b[ic+3]; 1231 t[ii+4] = b[ic+4]; 1232 ii += 5; 1233 } 1234 1235 /* forward solve the U^T */ 1236 idx = 0; 1237 for (i=0; i<n; i++) { 1238 1239 v = aa + 25*diag[i]; 1240 /* multiply by the inverse of the block diagonal */ 1241 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1242 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1243 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1244 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1245 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1246 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1247 v += 25; 1248 1249 vi = aj + diag[i] + 1; 1250 nz = ai[i+1] - diag[i] - 1; 1251 while (nz--) { 1252 oidx = 5*(*vi++); 1253 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1254 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1255 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1256 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1257 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1258 v += 25; 1259 } 1260 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1261 idx += 5; 1262 } 1263 /* backward solve the L^T */ 1264 for (i=n-1; i>=0; i--){ 1265 v = aa + 25*diag[i] - 25; 1266 vi = aj + diag[i] - 1; 1267 nz = diag[i] - ai[i]; 1268 idt = 5*i; 1269 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1270 while (nz--) { 1271 idx = 5*(*vi--); 1272 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1273 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1274 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1275 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1276 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1277 v -= 25; 1278 } 1279 } 1280 1281 /* copy t into x according to permutation */ 1282 ii = 0; 1283 for (i=0; i<n; i++) { 1284 ir = 5*r[i]; 1285 x[ir] = t[ii]; 1286 x[ir+1] = t[ii+1]; 1287 x[ir+2] = t[ii+2]; 1288 x[ir+3] = t[ii+3]; 1289 x[ir+4] = t[ii+4]; 1290 ii += 5; 1291 } 1292 1293 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1294 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1295 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1296 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1297 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1298 PetscFunctionReturn(0); 1299 } 1300 1301 #undef __FUNCT__ 1302 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1303 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1304 { 1305 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1306 IS iscol=a->col,isrow=a->row; 1307 PetscErrorCode ierr; 1308 const PetscInt *r,*c,*rout,*cout; 1309 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1310 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1311 MatScalar *aa=a->a,*v; 1312 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1313 PetscScalar *x,*b,*t; 1314 1315 PetscFunctionBegin; 1316 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1317 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1318 t = a->solve_work; 1319 1320 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1321 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1322 1323 /* copy the b into temp work space according to permutation */ 1324 ii = 0; 1325 for (i=0; i<n; i++) { 1326 ic = 6*c[i]; 1327 t[ii] = b[ic]; 1328 t[ii+1] = b[ic+1]; 1329 t[ii+2] = b[ic+2]; 1330 t[ii+3] = b[ic+3]; 1331 t[ii+4] = b[ic+4]; 1332 t[ii+5] = b[ic+5]; 1333 ii += 6; 1334 } 1335 1336 /* forward solve the U^T */ 1337 idx = 0; 1338 for (i=0; i<n; i++) { 1339 1340 v = aa + 36*diag[i]; 1341 /* multiply by the inverse of the block diagonal */ 1342 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1343 x6 = t[5+idx]; 1344 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1345 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1346 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1347 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1348 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1349 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1350 v += 36; 1351 1352 vi = aj + diag[i] + 1; 1353 nz = ai[i+1] - diag[i] - 1; 1354 while (nz--) { 1355 oidx = 6*(*vi++); 1356 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1357 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1358 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1359 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1360 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1361 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1362 v += 36; 1363 } 1364 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1365 t[5+idx] = s6; 1366 idx += 6; 1367 } 1368 /* backward solve the L^T */ 1369 for (i=n-1; i>=0; i--){ 1370 v = aa + 36*diag[i] - 36; 1371 vi = aj + diag[i] - 1; 1372 nz = diag[i] - ai[i]; 1373 idt = 6*i; 1374 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1375 s6 = t[5+idt]; 1376 while (nz--) { 1377 idx = 6*(*vi--); 1378 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1379 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1380 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1381 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1382 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1383 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1384 v -= 36; 1385 } 1386 } 1387 1388 /* copy t into x according to permutation */ 1389 ii = 0; 1390 for (i=0; i<n; i++) { 1391 ir = 6*r[i]; 1392 x[ir] = t[ii]; 1393 x[ir+1] = t[ii+1]; 1394 x[ir+2] = t[ii+2]; 1395 x[ir+3] = t[ii+3]; 1396 x[ir+4] = t[ii+4]; 1397 x[ir+5] = t[ii+5]; 1398 ii += 6; 1399 } 1400 1401 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1402 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1403 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1404 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1405 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1406 PetscFunctionReturn(0); 1407 } 1408 1409 #undef __FUNCT__ 1410 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1411 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1412 { 1413 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1414 IS iscol=a->col,isrow=a->row; 1415 PetscErrorCode ierr; 1416 const PetscInt *r,*c,*rout,*cout; 1417 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1418 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1419 MatScalar *aa=a->a,*v; 1420 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1421 PetscScalar *x,*b,*t; 1422 1423 PetscFunctionBegin; 1424 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1425 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1426 t = a->solve_work; 1427 1428 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1429 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1430 1431 /* copy the b into temp work space according to permutation */ 1432 ii = 0; 1433 for (i=0; i<n; i++) { 1434 ic = 7*c[i]; 1435 t[ii] = b[ic]; 1436 t[ii+1] = b[ic+1]; 1437 t[ii+2] = b[ic+2]; 1438 t[ii+3] = b[ic+3]; 1439 t[ii+4] = b[ic+4]; 1440 t[ii+5] = b[ic+5]; 1441 t[ii+6] = b[ic+6]; 1442 ii += 7; 1443 } 1444 1445 /* forward solve the U^T */ 1446 idx = 0; 1447 for (i=0; i<n; i++) { 1448 1449 v = aa + 49*diag[i]; 1450 /* multiply by the inverse of the block diagonal */ 1451 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1452 x6 = t[5+idx]; x7 = t[6+idx]; 1453 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1454 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1455 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1456 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1457 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1458 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1459 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1460 v += 49; 1461 1462 vi = aj + diag[i] + 1; 1463 nz = ai[i+1] - diag[i] - 1; 1464 while (nz--) { 1465 oidx = 7*(*vi++); 1466 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1467 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1468 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1469 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1470 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1471 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1472 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1473 v += 49; 1474 } 1475 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1476 t[5+idx] = s6;t[6+idx] = s7; 1477 idx += 7; 1478 } 1479 /* backward solve the L^T */ 1480 for (i=n-1; i>=0; i--){ 1481 v = aa + 49*diag[i] - 49; 1482 vi = aj + diag[i] - 1; 1483 nz = diag[i] - ai[i]; 1484 idt = 7*i; 1485 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1486 s6 = t[5+idt];s7 = t[6+idt]; 1487 while (nz--) { 1488 idx = 7*(*vi--); 1489 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1490 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1491 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1492 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1493 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1494 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1495 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1496 v -= 49; 1497 } 1498 } 1499 1500 /* copy t into x according to permutation */ 1501 ii = 0; 1502 for (i=0; i<n; i++) { 1503 ir = 7*r[i]; 1504 x[ir] = t[ii]; 1505 x[ir+1] = t[ii+1]; 1506 x[ir+2] = t[ii+2]; 1507 x[ir+3] = t[ii+3]; 1508 x[ir+4] = t[ii+4]; 1509 x[ir+5] = t[ii+5]; 1510 x[ir+6] = t[ii+6]; 1511 ii += 7; 1512 } 1513 1514 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1515 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1516 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1517 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1518 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1519 PetscFunctionReturn(0); 1520 } 1521 1522 /* ----------------------------------------------------------- */ 1523 #undef __FUNCT__ 1524 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1525 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1526 { 1527 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1528 IS iscol=a->col,isrow=a->row; 1529 PetscErrorCode ierr; 1530 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1531 PetscInt i,n=a->mbs; 1532 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1533 MatScalar *aa=a->a,*v; 1534 PetscScalar *x,*b,*s,*t,*ls; 1535 1536 PetscFunctionBegin; 1537 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1538 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1539 t = a->solve_work; 1540 1541 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1542 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1543 1544 /* forward solve the lower triangular */ 1545 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1546 for (i=1; i<n; i++) { 1547 v = aa + bs2*ai[i]; 1548 vi = aj + ai[i]; 1549 nz = a->diag[i] - ai[i]; 1550 s = t + bs*i; 1551 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1552 while (nz--) { 1553 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1554 v += bs2; 1555 } 1556 } 1557 /* backward solve the upper triangular */ 1558 ls = a->solve_work + A->cmap->n; 1559 for (i=n-1; i>=0; i--){ 1560 v = aa + bs2*(a->diag[i] + 1); 1561 vi = aj + a->diag[i] + 1; 1562 nz = ai[i+1] - a->diag[i] - 1; 1563 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1564 while (nz--) { 1565 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1566 v += bs2; 1567 } 1568 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1569 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1570 } 1571 1572 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1573 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1574 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1575 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1576 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1577 PetscFunctionReturn(0); 1578 } 1579 1580 /* ----------------------------------------------------------- */ 1581 #undef __FUNCT__ 1582 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 1583 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1584 { 1585 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1586 IS iscol=a->col,isrow=a->row; 1587 PetscErrorCode ierr; 1588 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1589 PetscInt i,n=a->mbs,j; 1590 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1591 const MatScalar *aa=a->a,*v; 1592 PetscScalar *x,*t,*ls; 1593 const PetscScalar *b; 1594 PetscFunctionBegin; 1595 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1596 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1597 t = a->solve_work; 1598 1599 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1600 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1601 1602 /* copy the b into temp work space according to permutation */ 1603 for (i=0; i<n; i++) { 1604 for (j=0; j<bs; j++) { 1605 t[i*bs+j] = b[c[i]*bs+j]; 1606 } 1607 } 1608 1609 1610 /* forward solve the upper triangular transpose */ 1611 ls = a->solve_work + A->cmap->n; 1612 for (i=0; i<n; i++){ 1613 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1614 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1615 v = aa + bs2*(a->diag[i] + 1); 1616 vi = aj + a->diag[i] + 1; 1617 nz = ai[i+1] - a->diag[i] - 1; 1618 while (nz--) { 1619 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1620 v += bs2; 1621 } 1622 } 1623 1624 /* backward solve the lower triangular transpose */ 1625 for (i=n-1; i>=0; i--) { 1626 v = aa + bs2*ai[i]; 1627 vi = aj + ai[i]; 1628 nz = a->diag[i] - ai[i]; 1629 while (nz--) { 1630 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 1631 v += bs2; 1632 } 1633 } 1634 1635 /* copy t into x according to permutation */ 1636 for (i=0; i<n; i++) { 1637 for (j=0; j<bs; j++) { 1638 x[bs*r[i]+j] = t[bs*i+j]; 1639 } 1640 } 1641 1642 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1643 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1644 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1645 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1646 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1647 PetscFunctionReturn(0); 1648 } 1649 1650 #undef __FUNCT__ 1651 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct" 1652 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx) 1653 { 1654 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1655 IS iscol=a->col,isrow=a->row; 1656 PetscErrorCode ierr; 1657 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 1658 PetscInt i,n=a->mbs,j; 1659 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1660 const MatScalar *aa=a->a,*v; 1661 PetscScalar *x,*t,*ls; 1662 const PetscScalar *b; 1663 PetscFunctionBegin; 1664 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1665 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1666 t = a->solve_work; 1667 1668 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1669 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1670 1671 /* copy the b into temp work space according to permutation */ 1672 for (i=0; i<n; i++) { 1673 for (j=0; j<bs; j++) { 1674 t[i*bs+j] = b[c[i]*bs+j]; 1675 } 1676 } 1677 1678 1679 /* forward solve the upper triangular transpose */ 1680 ls = a->solve_work + A->cmap->n; 1681 for (i=0; i<n; i++){ 1682 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1683 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 1684 v = aa + bs2*(diag[i] - 1); 1685 vi = aj + diag[i] - 1; 1686 nz = diag[i] - diag[i+1] - 1; 1687 for(j=0;j>-nz;j--){ 1688 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1689 v -= bs2; 1690 } 1691 } 1692 1693 /* backward solve the lower triangular transpose */ 1694 for (i=n-1; i>=0; i--) { 1695 v = aa + bs2*ai[i]; 1696 vi = aj + ai[i]; 1697 nz = ai[i+1] - ai[i]; 1698 for(j=0;j<nz;j++){ 1699 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 1700 v += bs2; 1701 } 1702 } 1703 1704 /* copy t into x according to permutation */ 1705 for (i=0; i<n; i++) { 1706 for (j=0; j<bs; j++) { 1707 x[bs*r[i]+j] = t[bs*i+j]; 1708 } 1709 } 1710 1711 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1712 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1713 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1714 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1715 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1716 PetscFunctionReturn(0); 1717 } 1718 1719 #undef __FUNCT__ 1720 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1721 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1722 { 1723 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1724 IS iscol=a->col,isrow=a->row; 1725 PetscErrorCode ierr; 1726 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1727 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1728 MatScalar *aa=a->a,*v; 1729 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1730 PetscScalar *x,*b,*t; 1731 1732 PetscFunctionBegin; 1733 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1734 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1735 t = a->solve_work; 1736 1737 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1738 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1739 1740 /* forward solve the lower triangular */ 1741 idx = 7*(*r++); 1742 t[0] = b[idx]; t[1] = b[1+idx]; 1743 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1744 t[5] = b[5+idx]; t[6] = b[6+idx]; 1745 1746 for (i=1; i<n; i++) { 1747 v = aa + 49*ai[i]; 1748 vi = aj + ai[i]; 1749 nz = diag[i] - ai[i]; 1750 idx = 7*(*r++); 1751 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1752 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1753 while (nz--) { 1754 idx = 7*(*vi++); 1755 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1756 x4 = t[3+idx];x5 = t[4+idx]; 1757 x6 = t[5+idx];x7 = t[6+idx]; 1758 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1759 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1760 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1761 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1762 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1763 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1764 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1765 v += 49; 1766 } 1767 idx = 7*i; 1768 t[idx] = s1;t[1+idx] = s2; 1769 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1770 t[5+idx] = s6;t[6+idx] = s7; 1771 } 1772 /* backward solve the upper triangular */ 1773 for (i=n-1; i>=0; i--){ 1774 v = aa + 49*diag[i] + 49; 1775 vi = aj + diag[i] + 1; 1776 nz = ai[i+1] - diag[i] - 1; 1777 idt = 7*i; 1778 s1 = t[idt]; s2 = t[1+idt]; 1779 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1780 s6 = t[5+idt];s7 = t[6+idt]; 1781 while (nz--) { 1782 idx = 7*(*vi++); 1783 x1 = t[idx]; x2 = t[1+idx]; 1784 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1785 x6 = t[5+idx]; x7 = t[6+idx]; 1786 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1787 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1788 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1789 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1790 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1791 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1792 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1793 v += 49; 1794 } 1795 idc = 7*(*c--); 1796 v = aa + 49*diag[i]; 1797 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1798 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1799 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1800 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1801 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1802 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1803 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1804 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1805 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1806 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1807 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1808 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1809 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1810 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1811 } 1812 1813 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1814 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1815 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1816 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1817 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1818 PetscFunctionReturn(0); 1819 } 1820 1821 #undef __FUNCT__ 1822 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 1823 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1824 { 1825 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1826 IS iscol=a->col,isrow=a->row; 1827 PetscErrorCode ierr; 1828 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 1829 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 1830 MatScalar *aa=a->a,*v; 1831 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1832 PetscScalar *x,*b,*t; 1833 1834 PetscFunctionBegin; 1835 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1836 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1837 t = a->solve_work; 1838 1839 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1840 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1841 1842 /* forward solve the lower triangular */ 1843 idx = 7*r[0]; 1844 t[0] = b[idx]; t[1] = b[1+idx]; 1845 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1846 t[5] = b[5+idx]; t[6] = b[6+idx]; 1847 1848 for (i=1; i<n; i++) { 1849 v = aa + 49*ai[i]; 1850 vi = aj + ai[i]; 1851 nz = ai[i+1] - ai[i]; 1852 idx = 7*r[i]; 1853 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1854 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1855 for(m=0;m<nz;m++){ 1856 idx = 7*vi[m]; 1857 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1858 x4 = t[3+idx];x5 = t[4+idx]; 1859 x6 = t[5+idx];x7 = t[6+idx]; 1860 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1861 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1862 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1863 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1864 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1865 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1866 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1867 v += 49; 1868 } 1869 idx = 7*i; 1870 t[idx] = s1;t[1+idx] = s2; 1871 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1872 t[5+idx] = s6;t[6+idx] = s7; 1873 } 1874 /* backward solve the upper triangular */ 1875 for (i=n-1; i>=0; i--){ 1876 v = aa + 49*(adiag[i+1]+1); 1877 vi = aj + adiag[i+1]+1; 1878 nz = adiag[i] - adiag[i+1] - 1; 1879 idt = 7*i; 1880 s1 = t[idt]; s2 = t[1+idt]; 1881 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1882 s6 = t[5+idt];s7 = t[6+idt]; 1883 for(m=0;m<nz;m++){ 1884 idx = 7*vi[m]; 1885 x1 = t[idx]; x2 = t[1+idx]; 1886 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1887 x6 = t[5+idx]; x7 = t[6+idx]; 1888 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1889 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1890 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1891 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1892 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1893 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1894 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1895 v += 49; 1896 } 1897 idc = 7*c[i]; 1898 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1899 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1900 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1901 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1902 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1903 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1904 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1905 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1906 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1907 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1908 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1909 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1910 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1911 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1912 } 1913 1914 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1915 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1916 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1917 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1918 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1919 PetscFunctionReturn(0); 1920 } 1921 1922 #undef __FUNCT__ 1923 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1924 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1925 { 1926 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1927 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1928 PetscErrorCode ierr; 1929 PetscInt *diag = a->diag,jdx; 1930 const MatScalar *aa=a->a,*v; 1931 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1932 const PetscScalar *b; 1933 1934 PetscFunctionBegin; 1935 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1936 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1937 /* forward solve the lower triangular */ 1938 idx = 0; 1939 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1940 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1941 x[6] = b[6+idx]; 1942 for (i=1; i<n; i++) { 1943 v = aa + 49*ai[i]; 1944 vi = aj + ai[i]; 1945 nz = diag[i] - ai[i]; 1946 idx = 7*i; 1947 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1948 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1949 s7 = b[6+idx]; 1950 while (nz--) { 1951 jdx = 7*(*vi++); 1952 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1953 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1954 x7 = x[6+jdx]; 1955 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1956 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1957 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1958 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1959 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1960 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1961 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1962 v += 49; 1963 } 1964 x[idx] = s1; 1965 x[1+idx] = s2; 1966 x[2+idx] = s3; 1967 x[3+idx] = s4; 1968 x[4+idx] = s5; 1969 x[5+idx] = s6; 1970 x[6+idx] = s7; 1971 } 1972 /* backward solve the upper triangular */ 1973 for (i=n-1; i>=0; i--){ 1974 v = aa + 49*diag[i] + 49; 1975 vi = aj + diag[i] + 1; 1976 nz = ai[i+1] - diag[i] - 1; 1977 idt = 7*i; 1978 s1 = x[idt]; s2 = x[1+idt]; 1979 s3 = x[2+idt]; s4 = x[3+idt]; 1980 s5 = x[4+idt]; s6 = x[5+idt]; 1981 s7 = x[6+idt]; 1982 while (nz--) { 1983 idx = 7*(*vi++); 1984 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1985 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1986 x7 = x[6+idx]; 1987 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1988 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1989 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1990 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1991 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1992 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1993 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1994 v += 49; 1995 } 1996 v = aa + 49*diag[i]; 1997 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1998 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1999 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2000 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2001 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2002 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2003 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2004 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2005 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2006 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2007 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2008 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2009 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2010 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2011 } 2012 2013 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2014 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2015 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2016 PetscFunctionReturn(0); 2017 } 2018 2019 #undef __FUNCT__ 2020 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 2021 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2022 { 2023 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2024 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2025 PetscErrorCode ierr; 2026 PetscInt idx,jdx,idt; 2027 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2028 const MatScalar *aa=a->a,*v; 2029 PetscScalar *x; 2030 const PetscScalar *b; 2031 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2032 2033 PetscFunctionBegin; 2034 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2035 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2036 /* forward solve the lower triangular */ 2037 idx = 0; 2038 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2039 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2040 for (i=1; i<n; i++) { 2041 v = aa + bs2*ai[i]; 2042 vi = aj + ai[i]; 2043 nz = ai[i+1] - ai[i]; 2044 idx = bs*i; 2045 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2046 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2047 for(k=0;k<nz;k++) { 2048 jdx = bs*vi[k]; 2049 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2050 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2051 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2052 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2053 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2054 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2055 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2056 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2057 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2058 v += bs2; 2059 } 2060 2061 x[idx] = s1; 2062 x[1+idx] = s2; 2063 x[2+idx] = s3; 2064 x[3+idx] = s4; 2065 x[4+idx] = s5; 2066 x[5+idx] = s6; 2067 x[6+idx] = s7; 2068 } 2069 2070 /* backward solve the upper triangular */ 2071 for (i=n-1; i>=0; i--){ 2072 v = aa + bs2*(adiag[i+1]+1); 2073 vi = aj + adiag[i+1]+1; 2074 nz = adiag[i] - adiag[i+1]-1; 2075 idt = bs*i; 2076 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2077 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2078 for(k=0;k<nz;k++) { 2079 idx = bs*vi[k]; 2080 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2081 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2082 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2083 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2084 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2085 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2086 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2087 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2088 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2089 v += bs2; 2090 } 2091 /* x = inv_diagonal*x */ 2092 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2093 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2094 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2095 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2096 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2097 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2098 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2099 } 2100 2101 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2102 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2103 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2104 PetscFunctionReturn(0); 2105 } 2106 2107 #undef __FUNCT__ 2108 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2109 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 2110 { 2111 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2112 IS iscol=a->col,isrow=a->row; 2113 PetscErrorCode ierr; 2114 const PetscInt *r,*c,*rout,*cout; 2115 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2116 const MatScalar *aa=a->a,*v; 2117 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2118 const PetscScalar *b; 2119 PetscFunctionBegin; 2120 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2121 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2122 t = a->solve_work; 2123 2124 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2125 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2126 2127 /* forward solve the lower triangular */ 2128 idx = 6*(*r++); 2129 t[0] = b[idx]; t[1] = b[1+idx]; 2130 t[2] = b[2+idx]; t[3] = b[3+idx]; 2131 t[4] = b[4+idx]; t[5] = b[5+idx]; 2132 for (i=1; i<n; i++) { 2133 v = aa + 36*ai[i]; 2134 vi = aj + ai[i]; 2135 nz = diag[i] - ai[i]; 2136 idx = 6*(*r++); 2137 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2138 s5 = b[4+idx]; s6 = b[5+idx]; 2139 while (nz--) { 2140 idx = 6*(*vi++); 2141 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2142 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2143 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2144 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2145 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2146 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2147 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2148 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2149 v += 36; 2150 } 2151 idx = 6*i; 2152 t[idx] = s1;t[1+idx] = s2; 2153 t[2+idx] = s3;t[3+idx] = s4; 2154 t[4+idx] = s5;t[5+idx] = s6; 2155 } 2156 /* backward solve the upper triangular */ 2157 for (i=n-1; i>=0; i--){ 2158 v = aa + 36*diag[i] + 36; 2159 vi = aj + diag[i] + 1; 2160 nz = ai[i+1] - diag[i] - 1; 2161 idt = 6*i; 2162 s1 = t[idt]; s2 = t[1+idt]; 2163 s3 = t[2+idt];s4 = t[3+idt]; 2164 s5 = t[4+idt];s6 = t[5+idt]; 2165 while (nz--) { 2166 idx = 6*(*vi++); 2167 x1 = t[idx]; x2 = t[1+idx]; 2168 x3 = t[2+idx]; x4 = t[3+idx]; 2169 x5 = t[4+idx]; x6 = t[5+idx]; 2170 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2171 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2172 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2173 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2174 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2175 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2176 v += 36; 2177 } 2178 idc = 6*(*c--); 2179 v = aa + 36*diag[i]; 2180 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2181 v[18]*s4+v[24]*s5+v[30]*s6; 2182 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2183 v[19]*s4+v[25]*s5+v[31]*s6; 2184 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2185 v[20]*s4+v[26]*s5+v[32]*s6; 2186 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2187 v[21]*s4+v[27]*s5+v[33]*s6; 2188 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2189 v[22]*s4+v[28]*s5+v[34]*s6; 2190 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2191 v[23]*s4+v[29]*s5+v[35]*s6; 2192 } 2193 2194 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2195 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2196 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2197 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2198 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2199 PetscFunctionReturn(0); 2200 } 2201 2202 #undef __FUNCT__ 2203 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2204 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 2205 { 2206 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2207 IS iscol=a->col,isrow=a->row; 2208 PetscErrorCode ierr; 2209 const PetscInt *r,*c,*rout,*cout; 2210 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2211 const MatScalar *aa=a->a,*v; 2212 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2213 const PetscScalar *b; 2214 PetscFunctionBegin; 2215 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2216 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2217 t = a->solve_work; 2218 2219 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2220 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2221 2222 /* forward solve the lower triangular */ 2223 idx = 6*r[0]; 2224 t[0] = b[idx]; t[1] = b[1+idx]; 2225 t[2] = b[2+idx]; t[3] = b[3+idx]; 2226 t[4] = b[4+idx]; t[5] = b[5+idx]; 2227 for (i=1; i<n; i++) { 2228 v = aa + 36*ai[i]; 2229 vi = aj + ai[i]; 2230 nz = ai[i+1] - ai[i]; 2231 idx = 6*r[i]; 2232 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2233 s5 = b[4+idx]; s6 = b[5+idx]; 2234 for(m=0;m<nz;m++){ 2235 idx = 6*vi[m]; 2236 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2237 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2238 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2239 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2240 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2241 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2242 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2243 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2244 v += 36; 2245 } 2246 idx = 6*i; 2247 t[idx] = s1;t[1+idx] = s2; 2248 t[2+idx] = s3;t[3+idx] = s4; 2249 t[4+idx] = s5;t[5+idx] = s6; 2250 } 2251 /* backward solve the upper triangular */ 2252 for (i=n-1; i>=0; i--){ 2253 v = aa + 36*(adiag[i+1]+1); 2254 vi = aj + adiag[i+1]+1; 2255 nz = adiag[i] - adiag[i+1] - 1; 2256 idt = 6*i; 2257 s1 = t[idt]; s2 = t[1+idt]; 2258 s3 = t[2+idt];s4 = t[3+idt]; 2259 s5 = t[4+idt];s6 = t[5+idt]; 2260 for(m=0;m<nz;m++){ 2261 idx = 6*vi[m]; 2262 x1 = t[idx]; x2 = t[1+idx]; 2263 x3 = t[2+idx]; x4 = t[3+idx]; 2264 x5 = t[4+idx]; x6 = t[5+idx]; 2265 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2266 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2267 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2268 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2269 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2270 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2271 v += 36; 2272 } 2273 idc = 6*c[i]; 2274 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2275 v[18]*s4+v[24]*s5+v[30]*s6; 2276 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2277 v[19]*s4+v[25]*s5+v[31]*s6; 2278 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2279 v[20]*s4+v[26]*s5+v[32]*s6; 2280 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2281 v[21]*s4+v[27]*s5+v[33]*s6; 2282 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2283 v[22]*s4+v[28]*s5+v[34]*s6; 2284 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2285 v[23]*s4+v[29]*s5+v[35]*s6; 2286 } 2287 2288 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2289 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2290 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2291 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2292 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2293 PetscFunctionReturn(0); 2294 } 2295 2296 #undef __FUNCT__ 2297 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2298 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 2299 { 2300 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2301 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2302 PetscErrorCode ierr; 2303 PetscInt *diag = a->diag,jdx; 2304 const MatScalar *aa=a->a,*v; 2305 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2306 const PetscScalar *b; 2307 2308 PetscFunctionBegin; 2309 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2310 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2311 /* forward solve the lower triangular */ 2312 idx = 0; 2313 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2314 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2315 for (i=1; i<n; i++) { 2316 v = aa + 36*ai[i]; 2317 vi = aj + ai[i]; 2318 nz = diag[i] - ai[i]; 2319 idx = 6*i; 2320 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2321 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2322 while (nz--) { 2323 jdx = 6*(*vi++); 2324 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2325 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2326 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2327 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2328 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2329 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2330 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2331 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2332 v += 36; 2333 } 2334 x[idx] = s1; 2335 x[1+idx] = s2; 2336 x[2+idx] = s3; 2337 x[3+idx] = s4; 2338 x[4+idx] = s5; 2339 x[5+idx] = s6; 2340 } 2341 /* backward solve the upper triangular */ 2342 for (i=n-1; i>=0; i--){ 2343 v = aa + 36*diag[i] + 36; 2344 vi = aj + diag[i] + 1; 2345 nz = ai[i+1] - diag[i] - 1; 2346 idt = 6*i; 2347 s1 = x[idt]; s2 = x[1+idt]; 2348 s3 = x[2+idt]; s4 = x[3+idt]; 2349 s5 = x[4+idt]; s6 = x[5+idt]; 2350 while (nz--) { 2351 idx = 6*(*vi++); 2352 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2353 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2354 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2355 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2356 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2357 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2358 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2359 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2360 v += 36; 2361 } 2362 v = aa + 36*diag[i]; 2363 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2364 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2365 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2366 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2367 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2368 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2369 } 2370 2371 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2372 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2373 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2374 PetscFunctionReturn(0); 2375 } 2376 2377 #undef __FUNCT__ 2378 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2379 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2380 { 2381 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2382 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2383 PetscErrorCode ierr; 2384 PetscInt idx,jdx,idt; 2385 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2386 const MatScalar *aa=a->a,*v; 2387 PetscScalar *x; 2388 const PetscScalar *b; 2389 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2390 2391 PetscFunctionBegin; 2392 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2393 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2394 /* forward solve the lower triangular */ 2395 idx = 0; 2396 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2397 x[4] = b[4+idx];x[5] = b[5+idx]; 2398 for (i=1; i<n; i++) { 2399 v = aa + bs2*ai[i]; 2400 vi = aj + ai[i]; 2401 nz = ai[i+1] - ai[i]; 2402 idx = bs*i; 2403 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2404 s5 = b[4+idx];s6 = b[5+idx]; 2405 for(k=0;k<nz;k++){ 2406 jdx = bs*vi[k]; 2407 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2408 x5 = x[4+jdx]; x6 = x[5+jdx]; 2409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2415 v += bs2; 2416 } 2417 2418 x[idx] = s1; 2419 x[1+idx] = s2; 2420 x[2+idx] = s3; 2421 x[3+idx] = s4; 2422 x[4+idx] = s5; 2423 x[5+idx] = s6; 2424 } 2425 2426 /* backward solve the upper triangular */ 2427 for (i=n-1; i>=0; i--){ 2428 v = aa + bs2*(adiag[i+1]+1); 2429 vi = aj + adiag[i+1]+1; 2430 nz = adiag[i] - adiag[i+1]-1; 2431 idt = bs*i; 2432 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2433 s5 = x[4+idt];s6 = x[5+idt]; 2434 for(k=0;k<nz;k++){ 2435 idx = bs*vi[k]; 2436 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2437 x5 = x[4+idx];x6 = x[5+idx]; 2438 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2439 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2440 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2441 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2442 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2443 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2444 v += bs2; 2445 } 2446 /* x = inv_diagonal*x */ 2447 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2448 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2449 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2450 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2451 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2452 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2453 } 2454 2455 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2456 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2457 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2458 PetscFunctionReturn(0); 2459 } 2460 2461 #undef __FUNCT__ 2462 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2463 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2464 { 2465 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2466 IS iscol=a->col,isrow=a->row; 2467 PetscErrorCode ierr; 2468 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2469 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2470 const MatScalar *aa=a->a,*v; 2471 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2472 const PetscScalar *b; 2473 2474 PetscFunctionBegin; 2475 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2476 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2477 t = a->solve_work; 2478 2479 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2480 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2481 2482 /* forward solve the lower triangular */ 2483 idx = 5*(*r++); 2484 t[0] = b[idx]; t[1] = b[1+idx]; 2485 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2486 for (i=1; i<n; i++) { 2487 v = aa + 25*ai[i]; 2488 vi = aj + ai[i]; 2489 nz = diag[i] - ai[i]; 2490 idx = 5*(*r++); 2491 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2492 s5 = b[4+idx]; 2493 while (nz--) { 2494 idx = 5*(*vi++); 2495 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2496 x4 = t[3+idx];x5 = t[4+idx]; 2497 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2498 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2499 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2500 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2501 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2502 v += 25; 2503 } 2504 idx = 5*i; 2505 t[idx] = s1;t[1+idx] = s2; 2506 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2507 } 2508 /* backward solve the upper triangular */ 2509 for (i=n-1; i>=0; i--){ 2510 v = aa + 25*diag[i] + 25; 2511 vi = aj + diag[i] + 1; 2512 nz = ai[i+1] - diag[i] - 1; 2513 idt = 5*i; 2514 s1 = t[idt]; s2 = t[1+idt]; 2515 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2516 while (nz--) { 2517 idx = 5*(*vi++); 2518 x1 = t[idx]; x2 = t[1+idx]; 2519 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2520 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2521 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2522 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2523 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2524 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2525 v += 25; 2526 } 2527 idc = 5*(*c--); 2528 v = aa + 25*diag[i]; 2529 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2530 v[15]*s4+v[20]*s5; 2531 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2532 v[16]*s4+v[21]*s5; 2533 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2534 v[17]*s4+v[22]*s5; 2535 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2536 v[18]*s4+v[23]*s5; 2537 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2538 v[19]*s4+v[24]*s5; 2539 } 2540 2541 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2542 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2543 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2545 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2546 PetscFunctionReturn(0); 2547 } 2548 2549 #undef __FUNCT__ 2550 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 2551 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 2552 { 2553 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2554 IS iscol=a->col,isrow=a->row; 2555 PetscErrorCode ierr; 2556 const PetscInt *r,*c,*rout,*cout; 2557 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2558 const MatScalar *aa=a->a,*v; 2559 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 2560 const PetscScalar *b; 2561 2562 PetscFunctionBegin; 2563 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2564 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2565 t = a->solve_work; 2566 2567 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2568 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2569 2570 /* forward solve the lower triangular */ 2571 idx = 5*r[0]; 2572 t[0] = b[idx]; t[1] = b[1+idx]; 2573 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2574 for (i=1; i<n; i++) { 2575 v = aa + 25*ai[i]; 2576 vi = aj + ai[i]; 2577 nz = ai[i+1] - ai[i]; 2578 idx = 5*r[i]; 2579 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2580 s5 = b[4+idx]; 2581 for(m=0;m<nz;m++){ 2582 idx = 5*vi[m]; 2583 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2584 x4 = t[3+idx];x5 = t[4+idx]; 2585 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2586 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2587 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2588 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2589 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2590 v += 25; 2591 } 2592 idx = 5*i; 2593 t[idx] = s1;t[1+idx] = s2; 2594 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2595 } 2596 /* backward solve the upper triangular */ 2597 for (i=n-1; i>=0; i--){ 2598 v = aa + 25*(adiag[i+1]+1); 2599 vi = aj + adiag[i+1]+1; 2600 nz = adiag[i] - adiag[i+1] - 1; 2601 idt = 5*i; 2602 s1 = t[idt]; s2 = t[1+idt]; 2603 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2604 for(m=0;m<nz;m++){ 2605 idx = 5*vi[m]; 2606 x1 = t[idx]; x2 = t[1+idx]; 2607 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2608 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2609 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2610 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2611 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2612 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2613 v += 25; 2614 } 2615 idc = 5*c[i]; 2616 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 2617 v[15]*s4+v[20]*s5; 2618 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 2619 v[16]*s4+v[21]*s5; 2620 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 2621 v[17]*s4+v[22]*s5; 2622 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 2623 v[18]*s4+v[23]*s5; 2624 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 2625 v[19]*s4+v[24]*s5; 2626 } 2627 2628 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2629 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2630 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2631 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2632 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2633 PetscFunctionReturn(0); 2634 } 2635 2636 #undef __FUNCT__ 2637 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 2638 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 2639 { 2640 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2641 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2642 PetscErrorCode ierr; 2643 PetscInt *diag = a->diag,jdx; 2644 const MatScalar *aa=a->a,*v; 2645 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2646 const PetscScalar *b; 2647 2648 PetscFunctionBegin; 2649 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2650 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2651 /* forward solve the lower triangular */ 2652 idx = 0; 2653 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2654 for (i=1; i<n; i++) { 2655 v = aa + 25*ai[i]; 2656 vi = aj + ai[i]; 2657 nz = diag[i] - ai[i]; 2658 idx = 5*i; 2659 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2660 while (nz--) { 2661 jdx = 5*(*vi++); 2662 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2663 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2664 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2665 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2666 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2667 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2668 v += 25; 2669 } 2670 x[idx] = s1; 2671 x[1+idx] = s2; 2672 x[2+idx] = s3; 2673 x[3+idx] = s4; 2674 x[4+idx] = s5; 2675 } 2676 /* backward solve the upper triangular */ 2677 for (i=n-1; i>=0; i--){ 2678 v = aa + 25*diag[i] + 25; 2679 vi = aj + diag[i] + 1; 2680 nz = ai[i+1] - diag[i] - 1; 2681 idt = 5*i; 2682 s1 = x[idt]; s2 = x[1+idt]; 2683 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2684 while (nz--) { 2685 idx = 5*(*vi++); 2686 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2687 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2688 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2689 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2690 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2691 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2692 v += 25; 2693 } 2694 v = aa + 25*diag[i]; 2695 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2696 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2697 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2698 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2699 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2700 } 2701 2702 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2703 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2704 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2705 PetscFunctionReturn(0); 2706 } 2707 2708 #undef __FUNCT__ 2709 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 2710 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2711 { 2712 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2713 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 2714 PetscErrorCode ierr; 2715 PetscInt jdx; 2716 const MatScalar *aa=a->a,*v; 2717 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 2718 const PetscScalar *b; 2719 2720 PetscFunctionBegin; 2721 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2722 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2723 /* forward solve the lower triangular */ 2724 idx = 0; 2725 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 2726 for (i=1; i<n; i++) { 2727 v = aa + 25*ai[i]; 2728 vi = aj + ai[i]; 2729 nz = ai[i+1] - ai[i]; 2730 idx = 5*i; 2731 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 2732 for(k=0;k<nz;k++) { 2733 jdx = 5*vi[k]; 2734 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 2735 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2736 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2737 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2738 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2739 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2740 v += 25; 2741 } 2742 x[idx] = s1; 2743 x[1+idx] = s2; 2744 x[2+idx] = s3; 2745 x[3+idx] = s4; 2746 x[4+idx] = s5; 2747 } 2748 2749 /* backward solve the upper triangular */ 2750 for (i=n-1; i>=0; i--){ 2751 v = aa + 25*(adiag[i+1]+1); 2752 vi = aj + adiag[i+1]+1; 2753 nz = adiag[i] - adiag[i+1]-1; 2754 idt = 5*i; 2755 s1 = x[idt]; s2 = x[1+idt]; 2756 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 2757 for(k=0;k<nz;k++){ 2758 idx = 5*vi[k]; 2759 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 2760 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 2761 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 2762 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 2763 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 2764 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 2765 v += 25; 2766 } 2767 /* x = inv_diagonal*x */ 2768 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 2769 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 2770 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 2771 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 2772 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 2773 } 2774 2775 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2776 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2777 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 2778 PetscFunctionReturn(0); 2779 } 2780 2781 #undef __FUNCT__ 2782 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 2783 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 2784 { 2785 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2786 IS iscol=a->col,isrow=a->row; 2787 PetscErrorCode ierr; 2788 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2789 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2790 const MatScalar *aa=a->a,*v; 2791 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2792 const PetscScalar *b; 2793 2794 PetscFunctionBegin; 2795 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2796 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2797 t = a->solve_work; 2798 2799 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2800 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2801 2802 /* forward solve the lower triangular */ 2803 idx = 4*(*r++); 2804 t[0] = b[idx]; t[1] = b[1+idx]; 2805 t[2] = b[2+idx]; t[3] = b[3+idx]; 2806 for (i=1; i<n; i++) { 2807 v = aa + 16*ai[i]; 2808 vi = aj + ai[i]; 2809 nz = diag[i] - ai[i]; 2810 idx = 4*(*r++); 2811 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2812 while (nz--) { 2813 idx = 4*(*vi++); 2814 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2815 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2816 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2817 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2818 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2819 v += 16; 2820 } 2821 idx = 4*i; 2822 t[idx] = s1;t[1+idx] = s2; 2823 t[2+idx] = s3;t[3+idx] = s4; 2824 } 2825 /* backward solve the upper triangular */ 2826 for (i=n-1; i>=0; i--){ 2827 v = aa + 16*diag[i] + 16; 2828 vi = aj + diag[i] + 1; 2829 nz = ai[i+1] - diag[i] - 1; 2830 idt = 4*i; 2831 s1 = t[idt]; s2 = t[1+idt]; 2832 s3 = t[2+idt];s4 = t[3+idt]; 2833 while (nz--) { 2834 idx = 4*(*vi++); 2835 x1 = t[idx]; x2 = t[1+idx]; 2836 x3 = t[2+idx]; x4 = t[3+idx]; 2837 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2838 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2839 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2840 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2841 v += 16; 2842 } 2843 idc = 4*(*c--); 2844 v = aa + 16*diag[i]; 2845 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2846 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2847 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2848 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2849 } 2850 2851 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2852 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2853 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2854 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2855 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2856 PetscFunctionReturn(0); 2857 } 2858 2859 #undef __FUNCT__ 2860 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 2861 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 2862 { 2863 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2864 IS iscol=a->col,isrow=a->row; 2865 PetscErrorCode ierr; 2866 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2867 const PetscInt *r,*c,*rout,*cout; 2868 const MatScalar *aa=a->a,*v; 2869 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 2870 const PetscScalar *b; 2871 2872 PetscFunctionBegin; 2873 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2874 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2875 t = a->solve_work; 2876 2877 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2878 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2879 2880 /* forward solve the lower triangular */ 2881 idx = 4*r[0]; 2882 t[0] = b[idx]; t[1] = b[1+idx]; 2883 t[2] = b[2+idx]; t[3] = b[3+idx]; 2884 for (i=1; i<n; i++) { 2885 v = aa + 16*ai[i]; 2886 vi = aj + ai[i]; 2887 nz = ai[i+1] - ai[i]; 2888 idx = 4*r[i]; 2889 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2890 for(m=0;m<nz;m++){ 2891 idx = 4*vi[m]; 2892 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 2893 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2894 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2895 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2896 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2897 v += 16; 2898 } 2899 idx = 4*i; 2900 t[idx] = s1;t[1+idx] = s2; 2901 t[2+idx] = s3;t[3+idx] = s4; 2902 } 2903 /* backward solve the upper triangular */ 2904 for (i=n-1; i>=0; i--){ 2905 v = aa + 16*(adiag[i+1]+1); 2906 vi = aj + adiag[i+1]+1; 2907 nz = adiag[i] - adiag[i+1] - 1; 2908 idt = 4*i; 2909 s1 = t[idt]; s2 = t[1+idt]; 2910 s3 = t[2+idt];s4 = t[3+idt]; 2911 for(m=0;m<nz;m++){ 2912 idx = 4*vi[m]; 2913 x1 = t[idx]; x2 = t[1+idx]; 2914 x3 = t[2+idx]; x4 = t[3+idx]; 2915 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2916 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2917 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2918 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2919 v += 16; 2920 } 2921 idc = 4*c[i]; 2922 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 2923 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 2924 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 2925 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 2926 } 2927 2928 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2929 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2930 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2931 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2932 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2933 PetscFunctionReturn(0); 2934 } 2935 2936 #undef __FUNCT__ 2937 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 2938 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 2939 { 2940 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2941 IS iscol=a->col,isrow=a->row; 2942 PetscErrorCode ierr; 2943 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2944 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2945 const MatScalar *aa=a->a,*v; 2946 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 2947 PetscScalar *x; 2948 const PetscScalar *b; 2949 2950 PetscFunctionBegin; 2951 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2952 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2953 t = (MatScalar *)a->solve_work; 2954 2955 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2956 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2957 2958 /* forward solve the lower triangular */ 2959 idx = 4*(*r++); 2960 t[0] = (MatScalar)b[idx]; 2961 t[1] = (MatScalar)b[1+idx]; 2962 t[2] = (MatScalar)b[2+idx]; 2963 t[3] = (MatScalar)b[3+idx]; 2964 for (i=1; i<n; i++) { 2965 v = aa + 16*ai[i]; 2966 vi = aj + ai[i]; 2967 nz = diag[i] - ai[i]; 2968 idx = 4*(*r++); 2969 s1 = (MatScalar)b[idx]; 2970 s2 = (MatScalar)b[1+idx]; 2971 s3 = (MatScalar)b[2+idx]; 2972 s4 = (MatScalar)b[3+idx]; 2973 while (nz--) { 2974 idx = 4*(*vi++); 2975 x1 = t[idx]; 2976 x2 = t[1+idx]; 2977 x3 = t[2+idx]; 2978 x4 = t[3+idx]; 2979 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2980 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2981 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2982 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2983 v += 16; 2984 } 2985 idx = 4*i; 2986 t[idx] = s1; 2987 t[1+idx] = s2; 2988 t[2+idx] = s3; 2989 t[3+idx] = s4; 2990 } 2991 /* backward solve the upper triangular */ 2992 for (i=n-1; i>=0; i--){ 2993 v = aa + 16*diag[i] + 16; 2994 vi = aj + diag[i] + 1; 2995 nz = ai[i+1] - diag[i] - 1; 2996 idt = 4*i; 2997 s1 = t[idt]; 2998 s2 = t[1+idt]; 2999 s3 = t[2+idt]; 3000 s4 = t[3+idt]; 3001 while (nz--) { 3002 idx = 4*(*vi++); 3003 x1 = t[idx]; 3004 x2 = t[1+idx]; 3005 x3 = t[2+idx]; 3006 x4 = t[3+idx]; 3007 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3008 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3009 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3010 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3011 v += 16; 3012 } 3013 idc = 4*(*c--); 3014 v = aa + 16*diag[i]; 3015 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3016 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3017 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3018 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3019 x[idc] = (PetscScalar)t[idt]; 3020 x[1+idc] = (PetscScalar)t[1+idt]; 3021 x[2+idc] = (PetscScalar)t[2+idt]; 3022 x[3+idc] = (PetscScalar)t[3+idt]; 3023 } 3024 3025 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3026 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3027 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3028 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3029 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3030 PetscFunctionReturn(0); 3031 } 3032 3033 #if defined (PETSC_HAVE_SSE) 3034 3035 #include PETSC_HAVE_SSE 3036 3037 #undef __FUNCT__ 3038 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3039 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3040 { 3041 /* 3042 Note: This code uses demotion of double 3043 to float when performing the mixed-mode computation. 3044 This may not be numerically reasonable for all applications. 3045 */ 3046 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3047 IS iscol=a->col,isrow=a->row; 3048 PetscErrorCode ierr; 3049 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3050 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3051 MatScalar *aa=a->a,*v; 3052 PetscScalar *x,*b,*t; 3053 3054 /* Make space in temp stack for 16 Byte Aligned arrays */ 3055 float ssealignedspace[11],*tmps,*tmpx; 3056 unsigned long offset; 3057 3058 PetscFunctionBegin; 3059 SSE_SCOPE_BEGIN; 3060 3061 offset = (unsigned long)ssealignedspace % 16; 3062 if (offset) offset = (16 - offset)/4; 3063 tmps = &ssealignedspace[offset]; 3064 tmpx = &ssealignedspace[offset+4]; 3065 PREFETCH_NTA(aa+16*ai[1]); 3066 3067 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3068 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3069 t = a->solve_work; 3070 3071 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3072 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3073 3074 /* forward solve the lower triangular */ 3075 idx = 4*(*r++); 3076 t[0] = b[idx]; t[1] = b[1+idx]; 3077 t[2] = b[2+idx]; t[3] = b[3+idx]; 3078 v = aa + 16*ai[1]; 3079 3080 for (i=1; i<n;) { 3081 PREFETCH_NTA(&v[8]); 3082 vi = aj + ai[i]; 3083 nz = diag[i] - ai[i]; 3084 idx = 4*(*r++); 3085 3086 /* Demote sum from double to float */ 3087 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3088 LOAD_PS(tmps,XMM7); 3089 3090 while (nz--) { 3091 PREFETCH_NTA(&v[16]); 3092 idx = 4*(*vi++); 3093 3094 /* Demote solution (so far) from double to float */ 3095 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3096 3097 /* 4x4 Matrix-Vector product with negative accumulation: */ 3098 SSE_INLINE_BEGIN_2(tmpx,v) 3099 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3100 3101 /* First Column */ 3102 SSE_COPY_PS(XMM0,XMM6) 3103 SSE_SHUFFLE(XMM0,XMM0,0x00) 3104 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3105 SSE_SUB_PS(XMM7,XMM0) 3106 3107 /* Second Column */ 3108 SSE_COPY_PS(XMM1,XMM6) 3109 SSE_SHUFFLE(XMM1,XMM1,0x55) 3110 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3111 SSE_SUB_PS(XMM7,XMM1) 3112 3113 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3114 3115 /* Third Column */ 3116 SSE_COPY_PS(XMM2,XMM6) 3117 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3118 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3119 SSE_SUB_PS(XMM7,XMM2) 3120 3121 /* Fourth Column */ 3122 SSE_COPY_PS(XMM3,XMM6) 3123 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3124 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3125 SSE_SUB_PS(XMM7,XMM3) 3126 SSE_INLINE_END_2 3127 3128 v += 16; 3129 } 3130 idx = 4*i; 3131 v = aa + 16*ai[++i]; 3132 PREFETCH_NTA(v); 3133 STORE_PS(tmps,XMM7); 3134 3135 /* Promote result from float to double */ 3136 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3137 } 3138 /* backward solve the upper triangular */ 3139 idt = 4*(n-1); 3140 ai16 = 16*diag[n-1]; 3141 v = aa + ai16 + 16; 3142 for (i=n-1; i>=0;){ 3143 PREFETCH_NTA(&v[8]); 3144 vi = aj + diag[i] + 1; 3145 nz = ai[i+1] - diag[i] - 1; 3146 3147 /* Demote accumulator from double to float */ 3148 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3149 LOAD_PS(tmps,XMM7); 3150 3151 while (nz--) { 3152 PREFETCH_NTA(&v[16]); 3153 idx = 4*(*vi++); 3154 3155 /* Demote solution (so far) from double to float */ 3156 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3157 3158 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3159 SSE_INLINE_BEGIN_2(tmpx,v) 3160 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3161 3162 /* First Column */ 3163 SSE_COPY_PS(XMM0,XMM6) 3164 SSE_SHUFFLE(XMM0,XMM0,0x00) 3165 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3166 SSE_SUB_PS(XMM7,XMM0) 3167 3168 /* Second Column */ 3169 SSE_COPY_PS(XMM1,XMM6) 3170 SSE_SHUFFLE(XMM1,XMM1,0x55) 3171 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3172 SSE_SUB_PS(XMM7,XMM1) 3173 3174 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3175 3176 /* Third Column */ 3177 SSE_COPY_PS(XMM2,XMM6) 3178 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3179 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3180 SSE_SUB_PS(XMM7,XMM2) 3181 3182 /* Fourth Column */ 3183 SSE_COPY_PS(XMM3,XMM6) 3184 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3185 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3186 SSE_SUB_PS(XMM7,XMM3) 3187 SSE_INLINE_END_2 3188 v += 16; 3189 } 3190 v = aa + ai16; 3191 ai16 = 16*diag[--i]; 3192 PREFETCH_NTA(aa+ai16+16); 3193 /* 3194 Scale the result by the diagonal 4x4 block, 3195 which was inverted as part of the factorization 3196 */ 3197 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3198 /* First Column */ 3199 SSE_COPY_PS(XMM0,XMM7) 3200 SSE_SHUFFLE(XMM0,XMM0,0x00) 3201 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3202 3203 /* Second Column */ 3204 SSE_COPY_PS(XMM1,XMM7) 3205 SSE_SHUFFLE(XMM1,XMM1,0x55) 3206 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3207 SSE_ADD_PS(XMM0,XMM1) 3208 3209 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3210 3211 /* Third Column */ 3212 SSE_COPY_PS(XMM2,XMM7) 3213 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3214 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3215 SSE_ADD_PS(XMM0,XMM2) 3216 3217 /* Fourth Column */ 3218 SSE_COPY_PS(XMM3,XMM7) 3219 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3220 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3221 SSE_ADD_PS(XMM0,XMM3) 3222 3223 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3224 SSE_INLINE_END_3 3225 3226 /* Promote solution from float to double */ 3227 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3228 3229 /* Apply reordering to t and stream into x. */ 3230 /* This way, x doesn't pollute the cache. */ 3231 /* Be careful with size: 2 doubles = 4 floats! */ 3232 idc = 4*(*c--); 3233 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3234 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3235 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3236 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3237 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3238 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3239 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3240 SSE_INLINE_END_2 3241 v = aa + ai16 + 16; 3242 idt -= 4; 3243 } 3244 3245 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3246 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3247 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3249 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3250 SSE_SCOPE_END; 3251 PetscFunctionReturn(0); 3252 } 3253 3254 #endif 3255 3256 3257 /* 3258 Special case where the matrix was ILU(0) factored in the natural 3259 ordering. This eliminates the need for the column and row permutation. 3260 */ 3261 #undef __FUNCT__ 3262 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3263 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3264 { 3265 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3266 PetscInt n=a->mbs; 3267 const PetscInt *ai=a->i,*aj=a->j; 3268 PetscErrorCode ierr; 3269 const PetscInt *diag = a->diag; 3270 const MatScalar *aa=a->a; 3271 PetscScalar *x; 3272 const PetscScalar *b; 3273 3274 PetscFunctionBegin; 3275 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3276 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3277 3278 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3279 { 3280 static PetscScalar w[2000]; /* very BAD need to fix */ 3281 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3282 } 3283 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3284 { 3285 static PetscScalar w[2000]; /* very BAD need to fix */ 3286 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3287 } 3288 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3289 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3290 #else 3291 { 3292 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3293 const MatScalar *v; 3294 PetscInt jdx,idt,idx,nz,i,ai16; 3295 const PetscInt *vi; 3296 3297 /* forward solve the lower triangular */ 3298 idx = 0; 3299 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3300 for (i=1; i<n; i++) { 3301 v = aa + 16*ai[i]; 3302 vi = aj + ai[i]; 3303 nz = diag[i] - ai[i]; 3304 idx += 4; 3305 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3306 while (nz--) { 3307 jdx = 4*(*vi++); 3308 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3309 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3310 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3311 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3312 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3313 v += 16; 3314 } 3315 x[idx] = s1; 3316 x[1+idx] = s2; 3317 x[2+idx] = s3; 3318 x[3+idx] = s4; 3319 } 3320 /* backward solve the upper triangular */ 3321 idt = 4*(n-1); 3322 for (i=n-1; i>=0; i--){ 3323 ai16 = 16*diag[i]; 3324 v = aa + ai16 + 16; 3325 vi = aj + diag[i] + 1; 3326 nz = ai[i+1] - diag[i] - 1; 3327 s1 = x[idt]; s2 = x[1+idt]; 3328 s3 = x[2+idt];s4 = x[3+idt]; 3329 while (nz--) { 3330 idx = 4*(*vi++); 3331 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3332 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3333 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3334 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3335 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3336 v += 16; 3337 } 3338 v = aa + ai16; 3339 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3340 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3341 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3342 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3343 idt -= 4; 3344 } 3345 } 3346 #endif 3347 3348 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3349 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3350 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3351 PetscFunctionReturn(0); 3352 } 3353 3354 #undef __FUNCT__ 3355 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3356 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3357 { 3358 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3359 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3360 PetscErrorCode ierr; 3361 PetscInt idx,jdx,idt; 3362 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3363 const MatScalar *aa=a->a,*v; 3364 PetscScalar *x; 3365 const PetscScalar *b; 3366 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3367 3368 PetscFunctionBegin; 3369 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3370 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3371 /* forward solve the lower triangular */ 3372 idx = 0; 3373 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3374 for (i=1; i<n; i++) { 3375 v = aa + bs2*ai[i]; 3376 vi = aj + ai[i]; 3377 nz = ai[i+1] - ai[i]; 3378 idx = bs*i; 3379 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3380 for(k=0;k<nz;k++) { 3381 jdx = bs*vi[k]; 3382 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3383 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3384 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3385 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3386 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3387 3388 v += bs2; 3389 } 3390 3391 x[idx] = s1; 3392 x[1+idx] = s2; 3393 x[2+idx] = s3; 3394 x[3+idx] = s4; 3395 } 3396 3397 /* backward solve the upper triangular */ 3398 for (i=n-1; i>=0; i--){ 3399 v = aa + bs2*(adiag[i+1]+1); 3400 vi = aj + adiag[i+1]+1; 3401 nz = adiag[i] - adiag[i+1]-1; 3402 idt = bs*i; 3403 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3404 3405 for(k=0;k<nz;k++){ 3406 idx = bs*vi[k]; 3407 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3408 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3409 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3410 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3411 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3412 3413 v += bs2; 3414 } 3415 /* x = inv_diagonal*x */ 3416 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3417 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3418 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3419 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3420 3421 } 3422 3423 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3424 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3425 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3426 PetscFunctionReturn(0); 3427 } 3428 3429 #undef __FUNCT__ 3430 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3431 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3432 { 3433 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3434 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3435 PetscErrorCode ierr; 3436 PetscInt *diag = a->diag; 3437 MatScalar *aa=a->a; 3438 PetscScalar *x,*b; 3439 3440 PetscFunctionBegin; 3441 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3442 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3443 3444 { 3445 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3446 MatScalar *v,*t=(MatScalar *)x; 3447 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3448 3449 /* forward solve the lower triangular */ 3450 idx = 0; 3451 t[0] = (MatScalar)b[0]; 3452 t[1] = (MatScalar)b[1]; 3453 t[2] = (MatScalar)b[2]; 3454 t[3] = (MatScalar)b[3]; 3455 for (i=1; i<n; i++) { 3456 v = aa + 16*ai[i]; 3457 vi = aj + ai[i]; 3458 nz = diag[i] - ai[i]; 3459 idx += 4; 3460 s1 = (MatScalar)b[idx]; 3461 s2 = (MatScalar)b[1+idx]; 3462 s3 = (MatScalar)b[2+idx]; 3463 s4 = (MatScalar)b[3+idx]; 3464 while (nz--) { 3465 jdx = 4*(*vi++); 3466 x1 = t[jdx]; 3467 x2 = t[1+jdx]; 3468 x3 = t[2+jdx]; 3469 x4 = t[3+jdx]; 3470 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3471 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3472 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3473 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3474 v += 16; 3475 } 3476 t[idx] = s1; 3477 t[1+idx] = s2; 3478 t[2+idx] = s3; 3479 t[3+idx] = s4; 3480 } 3481 /* backward solve the upper triangular */ 3482 idt = 4*(n-1); 3483 for (i=n-1; i>=0; i--){ 3484 ai16 = 16*diag[i]; 3485 v = aa + ai16 + 16; 3486 vi = aj + diag[i] + 1; 3487 nz = ai[i+1] - diag[i] - 1; 3488 s1 = t[idt]; 3489 s2 = t[1+idt]; 3490 s3 = t[2+idt]; 3491 s4 = t[3+idt]; 3492 while (nz--) { 3493 idx = 4*(*vi++); 3494 x1 = (MatScalar)x[idx]; 3495 x2 = (MatScalar)x[1+idx]; 3496 x3 = (MatScalar)x[2+idx]; 3497 x4 = (MatScalar)x[3+idx]; 3498 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3499 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3500 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3501 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3502 v += 16; 3503 } 3504 v = aa + ai16; 3505 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 3506 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 3507 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 3508 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 3509 idt -= 4; 3510 } 3511 } 3512 3513 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3514 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3515 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3516 PetscFunctionReturn(0); 3517 } 3518 3519 #if defined (PETSC_HAVE_SSE) 3520 3521 #include PETSC_HAVE_SSE 3522 #undef __FUNCT__ 3523 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 3524 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 3525 { 3526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3527 unsigned short *aj=(unsigned short *)a->j; 3528 PetscErrorCode ierr; 3529 int *ai=a->i,n=a->mbs,*diag = a->diag; 3530 MatScalar *aa=a->a; 3531 PetscScalar *x,*b; 3532 3533 PetscFunctionBegin; 3534 SSE_SCOPE_BEGIN; 3535 /* 3536 Note: This code currently uses demotion of double 3537 to float when performing the mixed-mode computation. 3538 This may not be numerically reasonable for all applications. 3539 */ 3540 PREFETCH_NTA(aa+16*ai[1]); 3541 3542 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3543 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3544 { 3545 /* x will first be computed in single precision then promoted inplace to double */ 3546 MatScalar *v,*t=(MatScalar *)x; 3547 int nz,i,idt,ai16; 3548 unsigned int jdx,idx; 3549 unsigned short *vi; 3550 /* Forward solve the lower triangular factor. */ 3551 3552 /* First block is the identity. */ 3553 idx = 0; 3554 CONVERT_DOUBLE4_FLOAT4(t,b); 3555 v = aa + 16*((unsigned int)ai[1]); 3556 3557 for (i=1; i<n;) { 3558 PREFETCH_NTA(&v[8]); 3559 vi = aj + ai[i]; 3560 nz = diag[i] - ai[i]; 3561 idx += 4; 3562 3563 /* Demote RHS from double to float. */ 3564 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3565 LOAD_PS(&t[idx],XMM7); 3566 3567 while (nz--) { 3568 PREFETCH_NTA(&v[16]); 3569 jdx = 4*((unsigned int)(*vi++)); 3570 3571 /* 4x4 Matrix-Vector product with negative accumulation: */ 3572 SSE_INLINE_BEGIN_2(&t[jdx],v) 3573 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3574 3575 /* First Column */ 3576 SSE_COPY_PS(XMM0,XMM6) 3577 SSE_SHUFFLE(XMM0,XMM0,0x00) 3578 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3579 SSE_SUB_PS(XMM7,XMM0) 3580 3581 /* Second Column */ 3582 SSE_COPY_PS(XMM1,XMM6) 3583 SSE_SHUFFLE(XMM1,XMM1,0x55) 3584 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3585 SSE_SUB_PS(XMM7,XMM1) 3586 3587 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3588 3589 /* Third Column */ 3590 SSE_COPY_PS(XMM2,XMM6) 3591 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3592 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3593 SSE_SUB_PS(XMM7,XMM2) 3594 3595 /* Fourth Column */ 3596 SSE_COPY_PS(XMM3,XMM6) 3597 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3598 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3599 SSE_SUB_PS(XMM7,XMM3) 3600 SSE_INLINE_END_2 3601 3602 v += 16; 3603 } 3604 v = aa + 16*ai[++i]; 3605 PREFETCH_NTA(v); 3606 STORE_PS(&t[idx],XMM7); 3607 } 3608 3609 /* Backward solve the upper triangular factor.*/ 3610 3611 idt = 4*(n-1); 3612 ai16 = 16*diag[n-1]; 3613 v = aa + ai16 + 16; 3614 for (i=n-1; i>=0;){ 3615 PREFETCH_NTA(&v[8]); 3616 vi = aj + diag[i] + 1; 3617 nz = ai[i+1] - diag[i] - 1; 3618 3619 LOAD_PS(&t[idt],XMM7); 3620 3621 while (nz--) { 3622 PREFETCH_NTA(&v[16]); 3623 idx = 4*((unsigned int)(*vi++)); 3624 3625 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3626 SSE_INLINE_BEGIN_2(&t[idx],v) 3627 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3628 3629 /* First Column */ 3630 SSE_COPY_PS(XMM0,XMM6) 3631 SSE_SHUFFLE(XMM0,XMM0,0x00) 3632 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3633 SSE_SUB_PS(XMM7,XMM0) 3634 3635 /* Second Column */ 3636 SSE_COPY_PS(XMM1,XMM6) 3637 SSE_SHUFFLE(XMM1,XMM1,0x55) 3638 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3639 SSE_SUB_PS(XMM7,XMM1) 3640 3641 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3642 3643 /* Third Column */ 3644 SSE_COPY_PS(XMM2,XMM6) 3645 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3646 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3647 SSE_SUB_PS(XMM7,XMM2) 3648 3649 /* Fourth Column */ 3650 SSE_COPY_PS(XMM3,XMM6) 3651 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3652 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3653 SSE_SUB_PS(XMM7,XMM3) 3654 SSE_INLINE_END_2 3655 v += 16; 3656 } 3657 v = aa + ai16; 3658 ai16 = 16*diag[--i]; 3659 PREFETCH_NTA(aa+ai16+16); 3660 /* 3661 Scale the result by the diagonal 4x4 block, 3662 which was inverted as part of the factorization 3663 */ 3664 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3665 /* First Column */ 3666 SSE_COPY_PS(XMM0,XMM7) 3667 SSE_SHUFFLE(XMM0,XMM0,0x00) 3668 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3669 3670 /* Second Column */ 3671 SSE_COPY_PS(XMM1,XMM7) 3672 SSE_SHUFFLE(XMM1,XMM1,0x55) 3673 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3674 SSE_ADD_PS(XMM0,XMM1) 3675 3676 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3677 3678 /* Third Column */ 3679 SSE_COPY_PS(XMM2,XMM7) 3680 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3681 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3682 SSE_ADD_PS(XMM0,XMM2) 3683 3684 /* Fourth Column */ 3685 SSE_COPY_PS(XMM3,XMM7) 3686 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3687 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3688 SSE_ADD_PS(XMM0,XMM3) 3689 3690 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3691 SSE_INLINE_END_3 3692 3693 v = aa + ai16 + 16; 3694 idt -= 4; 3695 } 3696 3697 /* Convert t from single precision back to double precision (inplace)*/ 3698 idt = 4*(n-1); 3699 for (i=n-1;i>=0;i--) { 3700 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3701 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3702 PetscScalar *xtemp=&x[idt]; 3703 MatScalar *ttemp=&t[idt]; 3704 xtemp[3] = (PetscScalar)ttemp[3]; 3705 xtemp[2] = (PetscScalar)ttemp[2]; 3706 xtemp[1] = (PetscScalar)ttemp[1]; 3707 xtemp[0] = (PetscScalar)ttemp[0]; 3708 idt -= 4; 3709 } 3710 3711 } /* End of artificial scope. */ 3712 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3714 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3715 SSE_SCOPE_END; 3716 PetscFunctionReturn(0); 3717 } 3718 3719 #undef __FUNCT__ 3720 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 3721 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 3722 { 3723 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3724 int *aj=a->j; 3725 PetscErrorCode ierr; 3726 int *ai=a->i,n=a->mbs,*diag = a->diag; 3727 MatScalar *aa=a->a; 3728 PetscScalar *x,*b; 3729 3730 PetscFunctionBegin; 3731 SSE_SCOPE_BEGIN; 3732 /* 3733 Note: This code currently uses demotion of double 3734 to float when performing the mixed-mode computation. 3735 This may not be numerically reasonable for all applications. 3736 */ 3737 PREFETCH_NTA(aa+16*ai[1]); 3738 3739 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3740 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3741 { 3742 /* x will first be computed in single precision then promoted inplace to double */ 3743 MatScalar *v,*t=(MatScalar *)x; 3744 int nz,i,idt,ai16; 3745 int jdx,idx; 3746 int *vi; 3747 /* Forward solve the lower triangular factor. */ 3748 3749 /* First block is the identity. */ 3750 idx = 0; 3751 CONVERT_DOUBLE4_FLOAT4(t,b); 3752 v = aa + 16*ai[1]; 3753 3754 for (i=1; i<n;) { 3755 PREFETCH_NTA(&v[8]); 3756 vi = aj + ai[i]; 3757 nz = diag[i] - ai[i]; 3758 idx += 4; 3759 3760 /* Demote RHS from double to float. */ 3761 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 3762 LOAD_PS(&t[idx],XMM7); 3763 3764 while (nz--) { 3765 PREFETCH_NTA(&v[16]); 3766 jdx = 4*(*vi++); 3767 /* jdx = *vi++; */ 3768 3769 /* 4x4 Matrix-Vector product with negative accumulation: */ 3770 SSE_INLINE_BEGIN_2(&t[jdx],v) 3771 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3772 3773 /* First Column */ 3774 SSE_COPY_PS(XMM0,XMM6) 3775 SSE_SHUFFLE(XMM0,XMM0,0x00) 3776 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3777 SSE_SUB_PS(XMM7,XMM0) 3778 3779 /* Second Column */ 3780 SSE_COPY_PS(XMM1,XMM6) 3781 SSE_SHUFFLE(XMM1,XMM1,0x55) 3782 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3783 SSE_SUB_PS(XMM7,XMM1) 3784 3785 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3786 3787 /* Third Column */ 3788 SSE_COPY_PS(XMM2,XMM6) 3789 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3790 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3791 SSE_SUB_PS(XMM7,XMM2) 3792 3793 /* Fourth Column */ 3794 SSE_COPY_PS(XMM3,XMM6) 3795 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3796 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3797 SSE_SUB_PS(XMM7,XMM3) 3798 SSE_INLINE_END_2 3799 3800 v += 16; 3801 } 3802 v = aa + 16*ai[++i]; 3803 PREFETCH_NTA(v); 3804 STORE_PS(&t[idx],XMM7); 3805 } 3806 3807 /* Backward solve the upper triangular factor.*/ 3808 3809 idt = 4*(n-1); 3810 ai16 = 16*diag[n-1]; 3811 v = aa + ai16 + 16; 3812 for (i=n-1; i>=0;){ 3813 PREFETCH_NTA(&v[8]); 3814 vi = aj + diag[i] + 1; 3815 nz = ai[i+1] - diag[i] - 1; 3816 3817 LOAD_PS(&t[idt],XMM7); 3818 3819 while (nz--) { 3820 PREFETCH_NTA(&v[16]); 3821 idx = 4*(*vi++); 3822 /* idx = *vi++; */ 3823 3824 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3825 SSE_INLINE_BEGIN_2(&t[idx],v) 3826 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3827 3828 /* First Column */ 3829 SSE_COPY_PS(XMM0,XMM6) 3830 SSE_SHUFFLE(XMM0,XMM0,0x00) 3831 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3832 SSE_SUB_PS(XMM7,XMM0) 3833 3834 /* Second Column */ 3835 SSE_COPY_PS(XMM1,XMM6) 3836 SSE_SHUFFLE(XMM1,XMM1,0x55) 3837 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3838 SSE_SUB_PS(XMM7,XMM1) 3839 3840 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3841 3842 /* Third Column */ 3843 SSE_COPY_PS(XMM2,XMM6) 3844 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3845 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3846 SSE_SUB_PS(XMM7,XMM2) 3847 3848 /* Fourth Column */ 3849 SSE_COPY_PS(XMM3,XMM6) 3850 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3851 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3852 SSE_SUB_PS(XMM7,XMM3) 3853 SSE_INLINE_END_2 3854 v += 16; 3855 } 3856 v = aa + ai16; 3857 ai16 = 16*diag[--i]; 3858 PREFETCH_NTA(aa+ai16+16); 3859 /* 3860 Scale the result by the diagonal 4x4 block, 3861 which was inverted as part of the factorization 3862 */ 3863 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 3864 /* First Column */ 3865 SSE_COPY_PS(XMM0,XMM7) 3866 SSE_SHUFFLE(XMM0,XMM0,0x00) 3867 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3868 3869 /* Second Column */ 3870 SSE_COPY_PS(XMM1,XMM7) 3871 SSE_SHUFFLE(XMM1,XMM1,0x55) 3872 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3873 SSE_ADD_PS(XMM0,XMM1) 3874 3875 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3876 3877 /* Third Column */ 3878 SSE_COPY_PS(XMM2,XMM7) 3879 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3880 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3881 SSE_ADD_PS(XMM0,XMM2) 3882 3883 /* Fourth Column */ 3884 SSE_COPY_PS(XMM3,XMM7) 3885 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3886 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3887 SSE_ADD_PS(XMM0,XMM3) 3888 3889 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3890 SSE_INLINE_END_3 3891 3892 v = aa + ai16 + 16; 3893 idt -= 4; 3894 } 3895 3896 /* Convert t from single precision back to double precision (inplace)*/ 3897 idt = 4*(n-1); 3898 for (i=n-1;i>=0;i--) { 3899 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 3900 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 3901 PetscScalar *xtemp=&x[idt]; 3902 MatScalar *ttemp=&t[idt]; 3903 xtemp[3] = (PetscScalar)ttemp[3]; 3904 xtemp[2] = (PetscScalar)ttemp[2]; 3905 xtemp[1] = (PetscScalar)ttemp[1]; 3906 xtemp[0] = (PetscScalar)ttemp[0]; 3907 idt -= 4; 3908 } 3909 3910 } /* End of artificial scope. */ 3911 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3912 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3913 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3914 SSE_SCOPE_END; 3915 PetscFunctionReturn(0); 3916 } 3917 3918 #endif 3919 3920 #undef __FUNCT__ 3921 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 3922 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 3923 { 3924 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3925 IS iscol=a->col,isrow=a->row; 3926 PetscErrorCode ierr; 3927 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3928 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3929 const MatScalar *aa=a->a,*v; 3930 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 3931 const PetscScalar *b; 3932 3933 PetscFunctionBegin; 3934 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3935 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3936 t = a->solve_work; 3937 3938 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3939 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3940 3941 /* forward solve the lower triangular */ 3942 idx = 3*(*r++); 3943 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 3944 for (i=1; i<n; i++) { 3945 v = aa + 9*ai[i]; 3946 vi = aj + ai[i]; 3947 nz = diag[i] - ai[i]; 3948 idx = 3*(*r++); 3949 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 3950 while (nz--) { 3951 idx = 3*(*vi++); 3952 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3953 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3954 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3955 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3956 v += 9; 3957 } 3958 idx = 3*i; 3959 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 3960 } 3961 /* backward solve the upper triangular */ 3962 for (i=n-1; i>=0; i--){ 3963 v = aa + 9*diag[i] + 9; 3964 vi = aj + diag[i] + 1; 3965 nz = ai[i+1] - diag[i] - 1; 3966 idt = 3*i; 3967 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 3968 while (nz--) { 3969 idx = 3*(*vi++); 3970 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 3971 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 3972 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 3973 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 3974 v += 9; 3975 } 3976 idc = 3*(*c--); 3977 v = aa + 9*diag[i]; 3978 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 3979 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 3980 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 3981 } 3982 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3983 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3984 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3985 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3986 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 3987 PetscFunctionReturn(0); 3988 } 3989 3990 #undef __FUNCT__ 3991 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 3992 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 3993 { 3994 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3995 IS iscol=a->col,isrow=a->row; 3996 PetscErrorCode ierr; 3997 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3998 const PetscInt *r,*c,*rout,*cout; 3999 const MatScalar *aa=a->a,*v; 4000 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4001 const PetscScalar *b; 4002 4003 PetscFunctionBegin; 4004 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4005 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4006 t = a->solve_work; 4007 4008 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4009 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4010 4011 /* forward solve the lower triangular */ 4012 idx = 3*r[0]; 4013 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4014 for (i=1; i<n; i++) { 4015 v = aa + 9*ai[i]; 4016 vi = aj + ai[i]; 4017 nz = ai[i+1] - ai[i]; 4018 idx = 3*r[i]; 4019 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4020 for(m=0;m<nz;m++){ 4021 idx = 3*vi[m]; 4022 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4023 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4024 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4025 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4026 v += 9; 4027 } 4028 idx = 3*i; 4029 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4030 } 4031 /* backward solve the upper triangular */ 4032 for (i=n-1; i>=0; i--){ 4033 v = aa + 9*(adiag[i+1]+1); 4034 vi = aj + adiag[i+1]+1; 4035 nz = adiag[i] - adiag[i+1] - 1; 4036 idt = 3*i; 4037 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4038 for(m=0;m<nz;m++){ 4039 idx = 3*vi[m]; 4040 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4041 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4042 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4043 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4044 v += 9; 4045 } 4046 idc = 3*c[i]; 4047 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4048 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4049 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4050 } 4051 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4052 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4053 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4054 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4055 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4056 PetscFunctionReturn(0); 4057 } 4058 4059 /* 4060 Special case where the matrix was ILU(0) factored in the natural 4061 ordering. This eliminates the need for the column and row permutation. 4062 */ 4063 #undef __FUNCT__ 4064 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4065 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4066 { 4067 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4068 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4069 PetscErrorCode ierr; 4070 PetscInt *diag = a->diag; 4071 const MatScalar *aa=a->a,*v; 4072 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4073 const PetscScalar *b; 4074 PetscInt jdx,idt,idx,nz,*vi,i; 4075 4076 PetscFunctionBegin; 4077 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4078 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4079 4080 /* forward solve the lower triangular */ 4081 idx = 0; 4082 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4083 for (i=1; i<n; i++) { 4084 v = aa + 9*ai[i]; 4085 vi = aj + ai[i]; 4086 nz = diag[i] - ai[i]; 4087 idx += 3; 4088 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4089 while (nz--) { 4090 jdx = 3*(*vi++); 4091 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4092 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4093 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4094 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4095 v += 9; 4096 } 4097 x[idx] = s1; 4098 x[1+idx] = s2; 4099 x[2+idx] = s3; 4100 } 4101 /* backward solve the upper triangular */ 4102 for (i=n-1; i>=0; i--){ 4103 v = aa + 9*diag[i] + 9; 4104 vi = aj + diag[i] + 1; 4105 nz = ai[i+1] - diag[i] - 1; 4106 idt = 3*i; 4107 s1 = x[idt]; s2 = x[1+idt]; 4108 s3 = x[2+idt]; 4109 while (nz--) { 4110 idx = 3*(*vi++); 4111 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4112 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4113 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4114 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4115 v += 9; 4116 } 4117 v = aa + 9*diag[i]; 4118 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4119 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4120 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4121 } 4122 4123 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4124 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4125 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4126 PetscFunctionReturn(0); 4127 } 4128 4129 #undef __FUNCT__ 4130 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4131 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4132 { 4133 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4134 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4135 PetscErrorCode ierr; 4136 PetscInt idx,jdx,idt; 4137 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4138 const MatScalar *aa=a->a,*v; 4139 PetscScalar *x; 4140 const PetscScalar *b; 4141 PetscScalar s1,s2,s3,x1,x2,x3; 4142 4143 PetscFunctionBegin; 4144 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4145 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4146 /* forward solve the lower triangular */ 4147 idx = 0; 4148 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4149 for (i=1; i<n; i++) { 4150 v = aa + bs2*ai[i]; 4151 vi = aj + ai[i]; 4152 nz = ai[i+1] - ai[i]; 4153 idx = bs*i; 4154 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4155 for(k=0;k<nz;k++){ 4156 jdx = bs*vi[k]; 4157 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4158 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4159 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4160 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4161 4162 v += bs2; 4163 } 4164 4165 x[idx] = s1; 4166 x[1+idx] = s2; 4167 x[2+idx] = s3; 4168 } 4169 4170 /* backward solve the upper triangular */ 4171 for (i=n-1; i>=0; i--){ 4172 v = aa + bs2*(adiag[i+1]+1); 4173 vi = aj + adiag[i+1]+1; 4174 nz = adiag[i] - adiag[i+1]-1; 4175 idt = bs*i; 4176 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4177 4178 for(k=0;k<nz;k++){ 4179 idx = bs*vi[k]; 4180 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4181 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4182 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4183 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4184 4185 v += bs2; 4186 } 4187 /* x = inv_diagonal*x */ 4188 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4189 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4190 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4191 4192 } 4193 4194 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4195 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4196 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4197 PetscFunctionReturn(0); 4198 } 4199 4200 #undef __FUNCT__ 4201 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4202 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4203 { 4204 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4205 IS iscol=a->col,isrow=a->row; 4206 PetscErrorCode ierr; 4207 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4208 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4209 const MatScalar *aa=a->a,*v; 4210 PetscScalar *x,s1,s2,x1,x2,*t; 4211 const PetscScalar *b; 4212 4213 PetscFunctionBegin; 4214 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4215 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4216 t = a->solve_work; 4217 4218 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4219 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4220 4221 /* forward solve the lower triangular */ 4222 idx = 2*(*r++); 4223 t[0] = b[idx]; t[1] = b[1+idx]; 4224 for (i=1; i<n; i++) { 4225 v = aa + 4*ai[i]; 4226 vi = aj + ai[i]; 4227 nz = diag[i] - ai[i]; 4228 idx = 2*(*r++); 4229 s1 = b[idx]; s2 = b[1+idx]; 4230 while (nz--) { 4231 idx = 2*(*vi++); 4232 x1 = t[idx]; x2 = t[1+idx]; 4233 s1 -= v[0]*x1 + v[2]*x2; 4234 s2 -= v[1]*x1 + v[3]*x2; 4235 v += 4; 4236 } 4237 idx = 2*i; 4238 t[idx] = s1; t[1+idx] = s2; 4239 } 4240 /* backward solve the upper triangular */ 4241 for (i=n-1; i>=0; i--){ 4242 v = aa + 4*diag[i] + 4; 4243 vi = aj + diag[i] + 1; 4244 nz = ai[i+1] - diag[i] - 1; 4245 idt = 2*i; 4246 s1 = t[idt]; s2 = t[1+idt]; 4247 while (nz--) { 4248 idx = 2*(*vi++); 4249 x1 = t[idx]; x2 = t[1+idx]; 4250 s1 -= v[0]*x1 + v[2]*x2; 4251 s2 -= v[1]*x1 + v[3]*x2; 4252 v += 4; 4253 } 4254 idc = 2*(*c--); 4255 v = aa + 4*diag[i]; 4256 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4257 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4258 } 4259 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4260 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4261 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4262 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4263 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4264 PetscFunctionReturn(0); 4265 } 4266 4267 #undef __FUNCT__ 4268 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4269 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4270 { 4271 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4272 IS iscol=a->col,isrow=a->row; 4273 PetscErrorCode ierr; 4274 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4275 const PetscInt *r,*c,*rout,*cout; 4276 const MatScalar *aa=a->a,*v; 4277 PetscScalar *x,s1,s2,x1,x2,*t; 4278 const PetscScalar *b; 4279 4280 PetscFunctionBegin; 4281 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4282 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4283 t = a->solve_work; 4284 4285 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4286 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4287 4288 /* forward solve the lower triangular */ 4289 idx = 2*r[0]; 4290 t[0] = b[idx]; t[1] = b[1+idx]; 4291 for (i=1; i<n; i++) { 4292 v = aa + 4*ai[i]; 4293 vi = aj + ai[i]; 4294 nz = ai[i+1] - ai[i]; 4295 idx = 2*r[i]; 4296 s1 = b[idx]; s2 = b[1+idx]; 4297 for(m=0;m<nz;m++){ 4298 jdx = 2*vi[m]; 4299 x1 = t[jdx]; x2 = t[1+jdx]; 4300 s1 -= v[0]*x1 + v[2]*x2; 4301 s2 -= v[1]*x1 + v[3]*x2; 4302 v += 4; 4303 } 4304 idx = 2*i; 4305 t[idx] = s1; t[1+idx] = s2; 4306 } 4307 /* backward solve the upper triangular */ 4308 for (i=n-1; i>=0; i--){ 4309 v = aa + 4*(adiag[i+1]+1); 4310 vi = aj + adiag[i+1]+1; 4311 nz = adiag[i] - adiag[i+1] - 1; 4312 idt = 2*i; 4313 s1 = t[idt]; s2 = t[1+idt]; 4314 for(m=0;m<nz;m++){ 4315 idx = 2*vi[m]; 4316 x1 = t[idx]; x2 = t[1+idx]; 4317 s1 -= v[0]*x1 + v[2]*x2; 4318 s2 -= v[1]*x1 + v[3]*x2; 4319 v += 4; 4320 } 4321 idc = 2*c[i]; 4322 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4323 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4324 } 4325 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4326 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4327 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4328 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4329 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4330 PetscFunctionReturn(0); 4331 } 4332 4333 /* 4334 Special case where the matrix was ILU(0) factored in the natural 4335 ordering. This eliminates the need for the column and row permutation. 4336 */ 4337 #undef __FUNCT__ 4338 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4339 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4340 { 4341 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4342 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4343 PetscErrorCode ierr; 4344 PetscInt *diag = a->diag; 4345 const MatScalar *aa=a->a,*v; 4346 PetscScalar *x,s1,s2,x1,x2; 4347 const PetscScalar *b; 4348 PetscInt jdx,idt,idx,nz,*vi,i; 4349 4350 PetscFunctionBegin; 4351 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4352 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4353 4354 /* forward solve the lower triangular */ 4355 idx = 0; 4356 x[0] = b[0]; x[1] = b[1]; 4357 for (i=1; i<n; i++) { 4358 v = aa + 4*ai[i]; 4359 vi = aj + ai[i]; 4360 nz = diag[i] - ai[i]; 4361 idx += 2; 4362 s1 = b[idx];s2 = b[1+idx]; 4363 while (nz--) { 4364 jdx = 2*(*vi++); 4365 x1 = x[jdx];x2 = x[1+jdx]; 4366 s1 -= v[0]*x1 + v[2]*x2; 4367 s2 -= v[1]*x1 + v[3]*x2; 4368 v += 4; 4369 } 4370 x[idx] = s1; 4371 x[1+idx] = s2; 4372 } 4373 /* backward solve the upper triangular */ 4374 for (i=n-1; i>=0; i--){ 4375 v = aa + 4*diag[i] + 4; 4376 vi = aj + diag[i] + 1; 4377 nz = ai[i+1] - diag[i] - 1; 4378 idt = 2*i; 4379 s1 = x[idt]; s2 = x[1+idt]; 4380 while (nz--) { 4381 idx = 2*(*vi++); 4382 x1 = x[idx]; x2 = x[1+idx]; 4383 s1 -= v[0]*x1 + v[2]*x2; 4384 s2 -= v[1]*x1 + v[3]*x2; 4385 v += 4; 4386 } 4387 v = aa + 4*diag[i]; 4388 x[idt] = v[0]*s1 + v[2]*s2; 4389 x[1+idt] = v[1]*s1 + v[3]*s2; 4390 } 4391 4392 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4394 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4395 PetscFunctionReturn(0); 4396 } 4397 4398 #undef __FUNCT__ 4399 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4400 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4401 { 4402 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4403 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4404 PetscErrorCode ierr; 4405 PetscInt jdx; 4406 const MatScalar *aa=a->a,*v; 4407 PetscScalar *x,s1,s2,x1,x2; 4408 const PetscScalar *b; 4409 4410 PetscFunctionBegin; 4411 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4412 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4413 /* forward solve the lower triangular */ 4414 idx = 0; 4415 x[0] = b[idx]; x[1] = b[1+idx]; 4416 for (i=1; i<n; i++) { 4417 v = aa + 4*ai[i]; 4418 vi = aj + ai[i]; 4419 nz = ai[i+1] - ai[i]; 4420 idx = 2*i; 4421 s1 = b[idx];s2 = b[1+idx]; 4422 for(k=0;k<nz;k++){ 4423 jdx = 2*vi[k]; 4424 x1 = x[jdx];x2 = x[1+jdx]; 4425 s1 -= v[0]*x1 + v[2]*x2; 4426 s2 -= v[1]*x1 + v[3]*x2; 4427 v += 4; 4428 } 4429 x[idx] = s1; 4430 x[1+idx] = s2; 4431 } 4432 4433 /* backward solve the upper triangular */ 4434 for (i=n-1; i>=0; i--){ 4435 v = aa + 4*(adiag[i+1]+1); 4436 vi = aj + adiag[i+1]+1; 4437 nz = adiag[i] - adiag[i+1]-1; 4438 idt = 2*i; 4439 s1 = x[idt]; s2 = x[1+idt]; 4440 for(k=0;k<nz;k++){ 4441 idx = 2*vi[k]; 4442 x1 = x[idx]; x2 = x[1+idx]; 4443 s1 -= v[0]*x1 + v[2]*x2; 4444 s2 -= v[1]*x1 + v[3]*x2; 4445 v += 4; 4446 } 4447 /* x = inv_diagonal*x */ 4448 x[idt] = v[0]*s1 + v[2]*s2; 4449 x[1+idt] = v[1]*s1 + v[3]*s2; 4450 } 4451 4452 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4453 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4454 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4455 PetscFunctionReturn(0); 4456 } 4457 4458 #undef __FUNCT__ 4459 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4460 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4461 { 4462 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4463 IS iscol=a->col,isrow=a->row; 4464 PetscErrorCode ierr; 4465 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4466 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4467 MatScalar *aa=a->a,*v; 4468 PetscScalar *x,*b,s1,*t; 4469 4470 PetscFunctionBegin; 4471 if (!n) PetscFunctionReturn(0); 4472 4473 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4474 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4475 t = a->solve_work; 4476 4477 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4478 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4479 4480 /* forward solve the lower triangular */ 4481 t[0] = b[*r++]; 4482 for (i=1; i<n; i++) { 4483 v = aa + ai[i]; 4484 vi = aj + ai[i]; 4485 nz = diag[i] - ai[i]; 4486 s1 = b[*r++]; 4487 while (nz--) { 4488 s1 -= (*v++)*t[*vi++]; 4489 } 4490 t[i] = s1; 4491 } 4492 /* backward solve the upper triangular */ 4493 for (i=n-1; i>=0; i--){ 4494 v = aa + diag[i] + 1; 4495 vi = aj + diag[i] + 1; 4496 nz = ai[i+1] - diag[i] - 1; 4497 s1 = t[i]; 4498 while (nz--) { 4499 s1 -= (*v++)*t[*vi++]; 4500 } 4501 x[*c--] = t[i] = aa[diag[i]]*s1; 4502 } 4503 4504 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4505 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4506 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4507 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4508 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4509 PetscFunctionReturn(0); 4510 } 4511 /* 4512 Special case where the matrix was ILU(0) factored in the natural 4513 ordering. This eliminates the need for the column and row permutation. 4514 */ 4515 #undef __FUNCT__ 4516 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 4517 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 4518 { 4519 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4520 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4521 PetscErrorCode ierr; 4522 PetscInt *diag = a->diag; 4523 MatScalar *aa=a->a; 4524 PetscScalar *x,*b; 4525 PetscScalar s1,x1; 4526 MatScalar *v; 4527 PetscInt jdx,idt,idx,nz,*vi,i; 4528 4529 PetscFunctionBegin; 4530 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4531 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4532 4533 /* forward solve the lower triangular */ 4534 idx = 0; 4535 x[0] = b[0]; 4536 for (i=1; i<n; i++) { 4537 v = aa + ai[i]; 4538 vi = aj + ai[i]; 4539 nz = diag[i] - ai[i]; 4540 idx += 1; 4541 s1 = b[idx]; 4542 while (nz--) { 4543 jdx = *vi++; 4544 x1 = x[jdx]; 4545 s1 -= v[0]*x1; 4546 v += 1; 4547 } 4548 x[idx] = s1; 4549 } 4550 /* backward solve the upper triangular */ 4551 for (i=n-1; i>=0; i--){ 4552 v = aa + diag[i] + 1; 4553 vi = aj + diag[i] + 1; 4554 nz = ai[i+1] - diag[i] - 1; 4555 idt = i; 4556 s1 = x[idt]; 4557 while (nz--) { 4558 idx = *vi++; 4559 x1 = x[idx]; 4560 s1 -= v[0]*x1; 4561 v += 1; 4562 } 4563 v = aa + diag[i]; 4564 x[idt] = v[0]*s1; 4565 } 4566 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4567 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4568 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 4569 PetscFunctionReturn(0); 4570 } 4571 4572 /* ----------------------------------------------------------------*/ 4573 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 4574 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 4575 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 4576 4577 #undef __FUNCT__ 4578 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 4579 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 4580 { 4581 Mat C=B; 4582 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 4583 IS isrow = b->row,isicol = b->icol; 4584 PetscErrorCode ierr; 4585 const PetscInt *r,*ic,*ics; 4586 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 4587 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 4588 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 4589 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 4590 MatScalar *v_work; 4591 PetscTruth col_identity,row_identity,both_identity; 4592 4593 PetscFunctionBegin; 4594 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4595 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4596 4597 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 4598 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 4599 ics = ic; 4600 4601 /* generate work space needed by dense LU factorization */ 4602 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 4603 4604 for (i=0; i<n; i++){ 4605 /* zero rtmp */ 4606 /* L part */ 4607 nz = bi[i+1] - bi[i]; 4608 bjtmp = bj + bi[i]; 4609 for (j=0; j<nz; j++){ 4610 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4611 } 4612 4613 /* U part */ 4614 nz = bdiag[i] - bdiag[i+1]; 4615 bjtmp = bj + bdiag[i+1]+1; 4616 for (j=0; j<nz; j++){ 4617 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4618 } 4619 4620 /* load in initial (unfactored row) */ 4621 nz = ai[r[i]+1] - ai[r[i]]; 4622 ajtmp = aj + ai[r[i]]; 4623 v = aa + bs2*ai[r[i]]; 4624 for (j=0; j<nz; j++) { 4625 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 4626 } 4627 4628 /* elimination */ 4629 bjtmp = bj + bi[i]; 4630 nzL = bi[i+1] - bi[i]; 4631 for(k=0;k < nzL;k++) { 4632 row = bjtmp[k]; 4633 pc = rtmp + bs2*row; 4634 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 4635 if (flg) { 4636 pv = b->a + bs2*bdiag[row]; 4637 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 4638 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 4639 pv = b->a + bs2*(bdiag[row+1]+1); 4640 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 4641 for (j=0; j<nz; j++) { 4642 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 4643 } 4644 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 4645 } 4646 } 4647 4648 /* finished row so stick it into b->a */ 4649 /* L part */ 4650 pv = b->a + bs2*bi[i] ; 4651 pj = b->j + bi[i] ; 4652 nz = bi[i+1] - bi[i]; 4653 for (j=0; j<nz; j++) { 4654 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4655 } 4656 4657 /* Mark diagonal and invert diagonal for simplier triangular solves */ 4658 pv = b->a + bs2*bdiag[i]; 4659 pj = b->j + bdiag[i]; 4660 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 4661 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4662 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 4663 4664 /* U part */ 4665 pv = b->a + bs2*(bdiag[i+1]+1); 4666 pj = b->j + bdiag[i+1]+1; 4667 nz = bdiag[i] - bdiag[i+1] - 1; 4668 for (j=0; j<nz; j++){ 4669 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 4670 } 4671 } 4672 4673 ierr = PetscFree(rtmp);CHKERRQ(ierr); 4674 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 4675 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4676 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4677 4678 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4679 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 4680 both_identity = (PetscTruth) (row_identity && col_identity); 4681 if (both_identity){ 4682 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 4683 } else { 4684 C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 4685 } 4686 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct; 4687 4688 C->assembled = PETSC_TRUE; 4689 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 4690 PetscFunctionReturn(0); 4691 } 4692 4693 /* 4694 ilu(0) with natural ordering under new data structure. 4695 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 4696 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 4697 */ 4698 4699 #undef __FUNCT__ 4700 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 4701 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4702 { 4703 4704 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4705 PetscErrorCode ierr; 4706 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 4707 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 4708 4709 PetscFunctionBegin; 4710 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 4711 b = (Mat_SeqBAIJ*)(fact)->data; 4712 4713 /* allocate matrix arrays for new data structure */ 4714 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 4715 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 4716 b->singlemalloc = PETSC_TRUE; 4717 if (!b->diag){ 4718 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 4719 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 4720 } 4721 bdiag = b->diag; 4722 4723 if (n > 0) { 4724 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 4725 } 4726 4727 /* set bi and bj with new data structure */ 4728 bi = b->i; 4729 bj = b->j; 4730 4731 /* L part */ 4732 bi[0] = 0; 4733 for (i=0; i<n; i++){ 4734 nz = adiag[i] - ai[i]; 4735 bi[i+1] = bi[i] + nz; 4736 aj = a->j + ai[i]; 4737 for (j=0; j<nz; j++){ 4738 *bj = aj[j]; bj++; 4739 } 4740 } 4741 4742 /* U part */ 4743 bi_temp = bi[n]; 4744 bdiag[n] = bi[n]-1; 4745 for (i=n-1; i>=0; i--){ 4746 nz = ai[i+1] - adiag[i] - 1; 4747 bi_temp = bi_temp + nz + 1; 4748 aj = a->j + adiag[i] + 1; 4749 for (j=0; j<nz; j++){ 4750 *bj = aj[j]; bj++; 4751 } 4752 /* diag[i] */ 4753 *bj = i; bj++; 4754 bdiag[i] = bi_temp - 1; 4755 } 4756 PetscFunctionReturn(0); 4757 } 4758 4759 #undef __FUNCT__ 4760 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 4761 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4762 { 4763 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4764 IS isicol; 4765 PetscErrorCode ierr; 4766 const PetscInt *r,*ic; 4767 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 4768 PetscInt *bi,*cols,nnz,*cols_lvl; 4769 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 4770 PetscInt i,levels,diagonal_fill; 4771 PetscTruth col_identity,row_identity,both_identity; 4772 PetscReal f; 4773 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 4774 PetscBT lnkbt; 4775 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 4776 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 4777 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 4778 PetscTruth missing; 4779 PetscInt bs=A->rmap->bs,bs2=a->bs2; 4780 4781 PetscFunctionBegin; 4782 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 4783 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 4784 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 4785 4786 f = info->fill; 4787 levels = (PetscInt)info->levels; 4788 diagonal_fill = (PetscInt)info->diagonal_fill; 4789 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4790 4791 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4792 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4793 both_identity = (PetscTruth) (row_identity && col_identity); 4794 4795 if (!levels && both_identity) { 4796 /* special case: ilu(0) with natural ordering */ 4797 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4798 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 4799 4800 fact->factor = MAT_FACTOR_ILU; 4801 (fact)->info.factor_mallocs = 0; 4802 (fact)->info.fill_ratio_given = info->fill; 4803 (fact)->info.fill_ratio_needed = 1.0; 4804 b = (Mat_SeqBAIJ*)(fact)->data; 4805 b->row = isrow; 4806 b->col = iscol; 4807 b->icol = isicol; 4808 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4809 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4810 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 4811 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4812 PetscFunctionReturn(0); 4813 } 4814 4815 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 4816 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 4817 4818 /* get new row pointers */ 4819 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 4820 bi[0] = 0; 4821 /* bdiag is location of diagonal in factor */ 4822 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 4823 bdiag[0] = 0; 4824 4825 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 4826 4827 /* create a linked list for storing column indices of the active row */ 4828 nlnk = n + 1; 4829 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4830 4831 /* initial FreeSpace size is f*(ai[n]+1) */ 4832 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 4833 current_space = free_space; 4834 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 4835 current_space_lvl = free_space_lvl; 4836 4837 for (i=0; i<n; i++) { 4838 nzi = 0; 4839 /* copy current row into linked list */ 4840 nnz = ai[r[i]+1] - ai[r[i]]; 4841 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 4842 cols = aj + ai[r[i]]; 4843 lnk[i] = -1; /* marker to indicate if diagonal exists */ 4844 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 4845 nzi += nlnk; 4846 4847 /* make sure diagonal entry is included */ 4848 if (diagonal_fill && lnk[i] == -1) { 4849 fm = n; 4850 while (lnk[fm] < i) fm = lnk[fm]; 4851 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 4852 lnk[fm] = i; 4853 lnk_lvl[i] = 0; 4854 nzi++; dcount++; 4855 } 4856 4857 /* add pivot rows into the active row */ 4858 nzbd = 0; 4859 prow = lnk[n]; 4860 while (prow < i) { 4861 nnz = bdiag[prow]; 4862 cols = bj_ptr[prow] + nnz + 1; 4863 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 4864 nnz = bi[prow+1] - bi[prow] - nnz - 1; 4865 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 4866 nzi += nlnk; 4867 prow = lnk[prow]; 4868 nzbd++; 4869 } 4870 bdiag[i] = nzbd; 4871 bi[i+1] = bi[i] + nzi; 4872 4873 /* if free space is not available, make more free space */ 4874 if (current_space->local_remaining<nzi) { 4875 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 4876 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 4877 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 4878 reallocs++; 4879 } 4880 4881 /* copy data into free_space and free_space_lvl, then initialize lnk */ 4882 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 4883 bj_ptr[i] = current_space->array; 4884 bjlvl_ptr[i] = current_space_lvl->array; 4885 4886 /* make sure the active row i has diagonal entry */ 4887 if (*(bj_ptr[i]+bdiag[i]) != i) { 4888 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 4889 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 4890 } 4891 4892 current_space->array += nzi; 4893 current_space->local_used += nzi; 4894 current_space->local_remaining -= nzi; 4895 current_space_lvl->array += nzi; 4896 current_space_lvl->local_used += nzi; 4897 current_space_lvl->local_remaining -= nzi; 4898 } 4899 4900 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 4901 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 4902 4903 /* destroy list of free space and other temporary arrays */ 4904 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 4905 4906 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 4907 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 4908 4909 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 4910 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 4911 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 4912 4913 #if defined(PETSC_USE_INFO) 4914 { 4915 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 4916 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 4917 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 4918 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 4919 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 4920 if (diagonal_fill) { 4921 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 4922 } 4923 } 4924 #endif 4925 4926 /* put together the new matrix */ 4927 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 4928 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 4929 b = (Mat_SeqBAIJ*)(fact)->data; 4930 b->free_a = PETSC_TRUE; 4931 b->free_ij = PETSC_TRUE; 4932 b->singlemalloc = PETSC_FALSE; 4933 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 4934 b->j = bj; 4935 b->i = bi; 4936 b->diag = bdiag; 4937 b->free_diag = PETSC_TRUE; 4938 b->ilen = 0; 4939 b->imax = 0; 4940 b->row = isrow; 4941 b->col = iscol; 4942 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 4943 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 4944 b->icol = isicol; 4945 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 4946 /* In b structure: Free imax, ilen, old a, old j. 4947 Allocate bdiag, solve_work, new a, new j */ 4948 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 4949 b->maxnz = b->nz = bdiag[0]+1; 4950 fact->info.factor_mallocs = reallocs; 4951 fact->info.fill_ratio_given = f; 4952 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 4953 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 4954 PetscFunctionReturn(0); 4955 } 4956 4957 4958 /* 4959 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 4960 except that the data structure of Mat_SeqAIJ is slightly different. 4961 Not a good example of code reuse. 4962 */ 4963 #undef __FUNCT__ 4964 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 4965 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 4966 { 4967 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 4968 IS isicol; 4969 PetscErrorCode ierr; 4970 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 4971 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 4972 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 4973 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 4974 PetscTruth col_identity,row_identity,both_identity,flg; 4975 PetscReal f; 4976 PetscTruth newdatastruct = PETSC_FALSE; 4977 4978 PetscFunctionBegin; 4979 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 4980 if (newdatastruct){ 4981 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 4982 PetscFunctionReturn(0); 4983 } 4984 4985 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 4986 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 4987 4988 f = info->fill; 4989 levels = (PetscInt)info->levels; 4990 diagonal_fill = (PetscInt)info->diagonal_fill; 4991 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 4992 4993 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 4994 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 4995 both_identity = (PetscTruth) (row_identity && col_identity); 4996 4997 if (!levels && both_identity) { /* special case copy the nonzero structure */ 4998 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 4999 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5000 5001 fact->factor = MAT_FACTOR_ILU; 5002 b = (Mat_SeqBAIJ*)fact->data; 5003 b->row = isrow; 5004 b->col = iscol; 5005 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5006 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5007 b->icol = isicol; 5008 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5009 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5010 PetscFunctionReturn(0); 5011 } 5012 5013 /* general case perform the symbolic factorization */ 5014 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5015 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5016 5017 /* get new row pointers */ 5018 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5019 ainew[0] = 0; 5020 /* don't know how many column pointers are needed so estimate */ 5021 jmax = (PetscInt)(f*ai[n] + 1); 5022 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5023 /* ajfill is level of fill for each fill entry */ 5024 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5025 /* fill is a linked list of nonzeros in active row */ 5026 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5027 /* im is level for each filled value */ 5028 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5029 /* dloc is location of diagonal in factor */ 5030 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5031 dloc[0] = 0; 5032 for (prow=0; prow<n; prow++) { 5033 5034 /* copy prow into linked list */ 5035 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5036 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5037 xi = aj + ai[r[prow]]; 5038 fill[n] = n; 5039 fill[prow] = -1; /* marker for diagonal entry */ 5040 while (nz--) { 5041 fm = n; 5042 idx = ic[*xi++]; 5043 do { 5044 m = fm; 5045 fm = fill[m]; 5046 } while (fm < idx); 5047 fill[m] = idx; 5048 fill[idx] = fm; 5049 im[idx] = 0; 5050 } 5051 5052 /* make sure diagonal entry is included */ 5053 if (diagonal_fill && fill[prow] == -1) { 5054 fm = n; 5055 while (fill[fm] < prow) fm = fill[fm]; 5056 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5057 fill[fm] = prow; 5058 im[prow] = 0; 5059 nzf++; 5060 dcount++; 5061 } 5062 5063 nzi = 0; 5064 row = fill[n]; 5065 while (row < prow) { 5066 incrlev = im[row] + 1; 5067 nz = dloc[row]; 5068 xi = ajnew + ainew[row] + nz + 1; 5069 flev = ajfill + ainew[row] + nz + 1; 5070 nnz = ainew[row+1] - ainew[row] - nz - 1; 5071 fm = row; 5072 while (nnz-- > 0) { 5073 idx = *xi++; 5074 if (*flev + incrlev > levels) { 5075 flev++; 5076 continue; 5077 } 5078 do { 5079 m = fm; 5080 fm = fill[m]; 5081 } while (fm < idx); 5082 if (fm != idx) { 5083 im[idx] = *flev + incrlev; 5084 fill[m] = idx; 5085 fill[idx] = fm; 5086 fm = idx; 5087 nzf++; 5088 } else { 5089 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5090 } 5091 flev++; 5092 } 5093 row = fill[row]; 5094 nzi++; 5095 } 5096 /* copy new filled row into permanent storage */ 5097 ainew[prow+1] = ainew[prow] + nzf; 5098 if (ainew[prow+1] > jmax) { 5099 5100 /* estimate how much additional space we will need */ 5101 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5102 /* just double the memory each time */ 5103 PetscInt maxadd = jmax; 5104 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5105 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5106 jmax += maxadd; 5107 5108 /* allocate a longer ajnew and ajfill */ 5109 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5110 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5111 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5112 ajnew = xitmp; 5113 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5114 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5115 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5116 ajfill = xitmp; 5117 reallocate++; /* count how many reallocations are needed */ 5118 } 5119 xitmp = ajnew + ainew[prow]; 5120 flev = ajfill + ainew[prow]; 5121 dloc[prow] = nzi; 5122 fm = fill[n]; 5123 while (nzf--) { 5124 *xitmp++ = fm; 5125 *flev++ = im[fm]; 5126 fm = fill[fm]; 5127 } 5128 /* make sure row has diagonal entry */ 5129 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5130 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5131 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5132 } 5133 } 5134 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5135 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5136 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5137 ierr = PetscFree(fill);CHKERRQ(ierr); 5138 ierr = PetscFree(im);CHKERRQ(ierr); 5139 5140 #if defined(PETSC_USE_INFO) 5141 { 5142 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5143 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5144 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5145 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5146 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5147 if (diagonal_fill) { 5148 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5149 } 5150 } 5151 #endif 5152 5153 /* put together the new matrix */ 5154 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5155 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5156 b = (Mat_SeqBAIJ*)fact->data; 5157 b->free_a = PETSC_TRUE; 5158 b->free_ij = PETSC_TRUE; 5159 b->singlemalloc = PETSC_FALSE; 5160 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5161 b->j = ajnew; 5162 b->i = ainew; 5163 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5164 b->diag = dloc; 5165 b->free_diag = PETSC_TRUE; 5166 b->ilen = 0; 5167 b->imax = 0; 5168 b->row = isrow; 5169 b->col = iscol; 5170 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5171 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5172 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5173 b->icol = isicol; 5174 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5175 /* In b structure: Free imax, ilen, old a, old j. 5176 Allocate dloc, solve_work, new a, new j */ 5177 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5178 b->maxnz = b->nz = ainew[n]; 5179 5180 fact->info.factor_mallocs = reallocate; 5181 fact->info.fill_ratio_given = f; 5182 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5183 5184 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5185 PetscFunctionReturn(0); 5186 } 5187 5188 #undef __FUNCT__ 5189 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5190 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5191 { 5192 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5193 /* int i,*AJ=a->j,nz=a->nz; */ 5194 PetscFunctionBegin; 5195 /* Undo Column scaling */ 5196 /* while (nz--) { */ 5197 /* AJ[i] = AJ[i]/4; */ 5198 /* } */ 5199 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5200 A->ops->setunfactored = PETSC_NULL; 5201 PetscFunctionReturn(0); 5202 } 5203 5204 #undef __FUNCT__ 5205 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5206 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5207 { 5208 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5209 PetscInt *AJ=a->j,nz=a->nz; 5210 unsigned short *aj=(unsigned short *)AJ; 5211 PetscFunctionBegin; 5212 /* Is this really necessary? */ 5213 while (nz--) { 5214 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5215 } 5216 A->ops->setunfactored = PETSC_NULL; 5217 PetscFunctionReturn(0); 5218 } 5219 5220 5221