1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 #include "petscbt.h" 11 #include "../src/mat/utils/freespace.h" 12 13 #undef __FUNCT__ 14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 16 { 17 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 18 PetscErrorCode ierr; 19 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 20 PetscInt *diag = a->diag; 21 MatScalar *aa=a->a,*v; 22 PetscScalar s1,*x,*b; 23 24 PetscFunctionBegin; 25 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 26 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 27 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 28 29 /* forward solve the U^T */ 30 for (i=0; i<n; i++) { 31 32 v = aa + diag[i]; 33 /* multiply by the inverse of the block diagonal */ 34 s1 = (*v++)*x[i]; 35 vi = aj + diag[i] + 1; 36 nz = ai[i+1] - diag[i] - 1; 37 while (nz--) { 38 x[*vi++] -= (*v++)*s1; 39 } 40 x[i] = s1; 41 } 42 /* backward solve the L^T */ 43 for (i=n-1; i>=0; i--){ 44 v = aa + diag[i] - 1; 45 vi = aj + diag[i] - 1; 46 nz = diag[i] - ai[i]; 47 s1 = x[i]; 48 while (nz--) { 49 x[*vi--] -= (*v--)*s1; 50 } 51 } 52 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 53 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 54 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 55 PetscFunctionReturn(0); 56 } 57 58 #undef __FUNCT__ 59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 61 { 62 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 63 PetscErrorCode ierr; 64 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 65 PetscInt *diag = a->diag,oidx; 66 MatScalar *aa=a->a,*v; 67 PetscScalar s1,s2,x1,x2; 68 PetscScalar *x,*b; 69 70 PetscFunctionBegin; 71 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 72 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 73 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 74 75 /* forward solve the U^T */ 76 idx = 0; 77 for (i=0; i<n; i++) { 78 79 v = aa + 4*diag[i]; 80 /* multiply by the inverse of the block diagonal */ 81 x1 = x[idx]; x2 = x[1+idx]; 82 s1 = v[0]*x1 + v[1]*x2; 83 s2 = v[2]*x1 + v[3]*x2; 84 v += 4; 85 86 vi = aj + diag[i] + 1; 87 nz = ai[i+1] - diag[i] - 1; 88 while (nz--) { 89 oidx = 2*(*vi++); 90 x[oidx] -= v[0]*s1 + v[1]*s2; 91 x[oidx+1] -= v[2]*s1 + v[3]*s2; 92 v += 4; 93 } 94 x[idx] = s1;x[1+idx] = s2; 95 idx += 2; 96 } 97 /* backward solve the L^T */ 98 for (i=n-1; i>=0; i--){ 99 v = aa + 4*diag[i] - 4; 100 vi = aj + diag[i] - 1; 101 nz = diag[i] - ai[i]; 102 idt = 2*i; 103 s1 = x[idt]; s2 = x[1+idt]; 104 while (nz--) { 105 idx = 2*(*vi--); 106 x[idx] -= v[0]*s1 + v[1]*s2; 107 x[idx+1] -= v[2]*s1 + v[3]*s2; 108 v -= 4; 109 } 110 } 111 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 112 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 113 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 114 PetscFunctionReturn(0); 115 } 116 117 #undef __FUNCT__ 118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct" 119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 120 { 121 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 122 PetscErrorCode ierr; 123 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 124 PetscInt nz,idx,idt,j,i,oidx; 125 PetscInt bs=A->rmap->bs,bs2=a->bs2; 126 MatScalar *aa=a->a,*v; 127 PetscScalar s1,s2,x1,x2; 128 PetscScalar *x,*b; 129 130 PetscFunctionBegin; 131 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 132 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 133 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 134 135 /* forward solve the U^T */ 136 idx = 0; 137 for (i=0; i<n; i++) { 138 v = aa + bs2*diag[i]; 139 /* multiply by the inverse of the block diagonal */ 140 x1 = x[idx]; x2 = x[1+idx]; 141 s1 = v[0]*x1 + v[1]*x2; 142 s2 = v[2]*x1 + v[3]*x2; 143 v -= bs2; 144 145 vi = aj + diag[i] - 1; 146 nz = diag[i] - diag[i+1] - 1; 147 for(j=0;j>-nz;j--){ 148 oidx = bs*vi[j]; 149 x[oidx] -= v[0]*s1 + v[1]*s2; 150 x[oidx+1] -= v[2]*s1 + v[3]*s2; 151 v -= bs2; 152 } 153 x[idx] = s1;x[1+idx] = s2; 154 idx += bs; 155 } 156 /* backward solve the L^T */ 157 for (i=n-1; i>=0; i--){ 158 v = aa + bs2*ai[i]; 159 vi = aj + ai[i]; 160 nz = ai[i+1] - ai[i]; 161 idt = bs*i; 162 s1 = x[idt]; s2 = x[1+idt]; 163 for(j=0;j<nz;j++){ 164 idx = bs*vi[j]; 165 x[idx] -= v[0]*s1 + v[1]*s2; 166 x[idx+1] -= v[2]*s1 + v[3]*s2; 167 v += bs2; 168 } 169 } 170 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 171 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 172 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 173 PetscFunctionReturn(0); 174 } 175 176 #undef __FUNCT__ 177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 179 { 180 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 181 PetscErrorCode ierr; 182 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 183 PetscInt *diag = a->diag,oidx; 184 MatScalar *aa=a->a,*v; 185 PetscScalar s1,s2,s3,x1,x2,x3; 186 PetscScalar *x,*b; 187 188 PetscFunctionBegin; 189 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 190 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 191 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 192 193 /* forward solve the U^T */ 194 idx = 0; 195 for (i=0; i<n; i++) { 196 197 v = aa + 9*diag[i]; 198 /* multiply by the inverse of the block diagonal */ 199 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 200 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 201 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 202 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 203 v += 9; 204 205 vi = aj + diag[i] + 1; 206 nz = ai[i+1] - diag[i] - 1; 207 while (nz--) { 208 oidx = 3*(*vi++); 209 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 210 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 211 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 212 v += 9; 213 } 214 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 215 idx += 3; 216 } 217 /* backward solve the L^T */ 218 for (i=n-1; i>=0; i--){ 219 v = aa + 9*diag[i] - 9; 220 vi = aj + diag[i] - 1; 221 nz = diag[i] - ai[i]; 222 idt = 3*i; 223 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 224 while (nz--) { 225 idx = 3*(*vi--); 226 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 227 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 228 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 229 v -= 9; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct" 240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 PetscErrorCode ierr; 244 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 245 PetscInt nz,idx,idt,j,i,oidx; 246 PetscInt bs=A->rmap->bs,bs2=a->bs2; 247 MatScalar *aa=a->a,*v; 248 PetscScalar s1,s2,s3,x1,x2,x3; 249 PetscScalar *x,*b; 250 251 PetscFunctionBegin; 252 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 253 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 254 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 255 256 /* forward solve the U^T */ 257 idx = 0; 258 for (i=0; i<n; i++) { 259 v = aa + bs2*diag[i]; 260 /* multiply by the inverse of the block diagonal */ 261 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 262 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 263 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 264 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 265 v -= bs2; 266 267 vi = aj + diag[i] - 1; 268 nz = diag[i] - diag[i+1] - 1; 269 for(j=0;j>-nz;j--){ 270 oidx = bs*vi[j]; 271 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 272 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 273 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 274 v -= bs2; 275 } 276 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 277 idx += bs; 278 } 279 /* backward solve the L^T */ 280 for (i=n-1; i>=0; i--){ 281 v = aa + bs2*ai[i]; 282 vi = aj + ai[i]; 283 nz = ai[i+1] - ai[i]; 284 idt = bs*i; 285 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 286 for(j=0;j<nz;j++){ 287 idx = bs*vi[j]; 288 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 289 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 290 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 291 v += bs2; 292 } 293 } 294 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 PetscErrorCode ierr; 306 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 PetscInt *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 310 PetscScalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316 317 /* forward solve the U^T */ 318 idx = 0; 319 for (i=0; i<n; i++) { 320 321 v = aa + 16*diag[i]; 322 /* multiply by the inverse of the block diagonal */ 323 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 325 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 326 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 327 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 328 v += 16; 329 330 vi = aj + diag[i] + 1; 331 nz = ai[i+1] - diag[i] - 1; 332 while (nz--) { 333 oidx = 4*(*vi++); 334 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 335 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 336 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 337 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 338 v += 16; 339 } 340 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 341 idx += 4; 342 } 343 /* backward solve the L^T */ 344 for (i=n-1; i>=0; i--){ 345 v = aa + 16*diag[i] - 16; 346 vi = aj + diag[i] - 1; 347 nz = diag[i] - ai[i]; 348 idt = 4*i; 349 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 350 while (nz--) { 351 idx = 4*(*vi--); 352 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 353 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 354 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 355 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 356 v -= 16; 357 } 358 } 359 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 360 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 361 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 362 PetscFunctionReturn(0); 363 } 364 365 #undef __FUNCT__ 366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct" 367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 368 { 369 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 370 PetscErrorCode ierr; 371 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 372 PetscInt nz,idx,idt,j,i,oidx; 373 PetscInt bs=A->rmap->bs,bs2=a->bs2; 374 MatScalar *aa=a->a,*v; 375 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 376 PetscScalar *x,*b; 377 378 PetscFunctionBegin; 379 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 380 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 381 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 382 383 /* forward solve the U^T */ 384 idx = 0; 385 for (i=0; i<n; i++) { 386 v = aa + bs2*diag[i]; 387 /* multiply by the inverse of the block diagonal */ 388 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 389 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 390 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 391 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 392 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 393 v -= bs2; 394 395 vi = aj + diag[i] - 1; 396 nz = diag[i] - diag[i+1] - 1; 397 for(j=0;j>-nz;j--){ 398 oidx = bs*vi[j]; 399 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 400 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 401 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 402 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 403 v -= bs2; 404 } 405 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; 406 idx += bs; 407 } 408 /* backward solve the L^T */ 409 for (i=n-1; i>=0; i--){ 410 v = aa + bs2*ai[i]; 411 vi = aj + ai[i]; 412 nz = ai[i+1] - ai[i]; 413 idt = bs*i; 414 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; 415 for(j=0;j<nz;j++){ 416 idx = bs*vi[j]; 417 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 418 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 419 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 420 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 421 v += bs2; 422 } 423 } 424 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 425 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 426 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 427 PetscFunctionReturn(0); 428 } 429 430 #undef __FUNCT__ 431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 433 { 434 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 435 PetscErrorCode ierr; 436 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 437 PetscInt *diag = a->diag,oidx; 438 MatScalar *aa=a->a,*v; 439 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 440 PetscScalar *x,*b; 441 442 PetscFunctionBegin; 443 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 444 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 446 447 /* forward solve the U^T */ 448 idx = 0; 449 for (i=0; i<n; i++) { 450 451 v = aa + 25*diag[i]; 452 /* multiply by the inverse of the block diagonal */ 453 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 454 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 455 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 456 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 457 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 458 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 459 v += 25; 460 461 vi = aj + diag[i] + 1; 462 nz = ai[i+1] - diag[i] - 1; 463 while (nz--) { 464 oidx = 5*(*vi++); 465 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 466 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 467 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 468 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 469 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 470 v += 25; 471 } 472 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 473 idx += 5; 474 } 475 /* backward solve the L^T */ 476 for (i=n-1; i>=0; i--){ 477 v = aa + 25*diag[i] - 25; 478 vi = aj + diag[i] - 1; 479 nz = diag[i] - ai[i]; 480 idt = 5*i; 481 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 482 while (nz--) { 483 idx = 5*(*vi--); 484 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 485 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 486 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 487 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 488 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 489 v -= 25; 490 } 491 } 492 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 493 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 494 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 495 PetscFunctionReturn(0); 496 } 497 498 #undef __FUNCT__ 499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct" 500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 501 { 502 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 503 PetscErrorCode ierr; 504 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 505 PetscInt nz,idx,idt,j,i,oidx; 506 PetscInt bs=A->rmap->bs,bs2=a->bs2; 507 MatScalar *aa=a->a,*v; 508 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 509 PetscScalar *x,*b; 510 511 PetscFunctionBegin; 512 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 513 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 514 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 515 516 /* forward solve the U^T */ 517 idx = 0; 518 for (i=0; i<n; i++) { 519 v = aa + bs2*diag[i]; 520 /* multiply by the inverse of the block diagonal */ 521 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 522 x5 = x[4+idx]; 523 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 524 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 525 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 526 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 527 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 528 v -= bs2; 529 530 vi = aj + diag[i] - 1; 531 nz = diag[i] - diag[i+1] - 1; 532 for(j=0;j>-nz;j--){ 533 oidx = bs*vi[j]; 534 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 535 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 536 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 537 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 538 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 539 v -= bs2; 540 } 541 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 542 idx += bs; 543 } 544 /* backward solve the L^T */ 545 for (i=n-1; i>=0; i--){ 546 v = aa + bs2*ai[i]; 547 vi = aj + ai[i]; 548 nz = ai[i+1] - ai[i]; 549 idt = bs*i; 550 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 551 for(j=0;j<nz;j++){ 552 idx = bs*vi[j]; 553 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 554 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 555 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 556 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 557 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 558 v += bs2; 559 } 560 } 561 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 562 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 563 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 564 PetscFunctionReturn(0); 565 } 566 567 #undef __FUNCT__ 568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 570 { 571 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 572 PetscErrorCode ierr; 573 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 574 PetscInt *diag = a->diag,oidx; 575 MatScalar *aa=a->a,*v; 576 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 577 PetscScalar *x,*b; 578 579 PetscFunctionBegin; 580 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 581 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 582 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 583 584 /* forward solve the U^T */ 585 idx = 0; 586 for (i=0; i<n; i++) { 587 588 v = aa + 36*diag[i]; 589 /* multiply by the inverse of the block diagonal */ 590 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 591 x6 = x[5+idx]; 592 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 593 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 594 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 595 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 596 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 597 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 598 v += 36; 599 600 vi = aj + diag[i] + 1; 601 nz = ai[i+1] - diag[i] - 1; 602 while (nz--) { 603 oidx = 6*(*vi++); 604 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 605 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 606 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 607 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 608 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 609 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 610 v += 36; 611 } 612 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 613 x[5+idx] = s6; 614 idx += 6; 615 } 616 /* backward solve the L^T */ 617 for (i=n-1; i>=0; i--){ 618 v = aa + 36*diag[i] - 36; 619 vi = aj + diag[i] - 1; 620 nz = diag[i] - ai[i]; 621 idt = 6*i; 622 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 623 s6 = x[5+idt]; 624 while (nz--) { 625 idx = 6*(*vi--); 626 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 627 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 628 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 629 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 630 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 631 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 632 v -= 36; 633 } 634 } 635 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 636 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 637 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 638 PetscFunctionReturn(0); 639 } 640 641 #undef __FUNCT__ 642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct" 643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 644 { 645 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 646 PetscErrorCode ierr; 647 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 648 PetscInt nz,idx,idt,j,i,oidx; 649 PetscInt bs=A->rmap->bs,bs2=a->bs2; 650 MatScalar *aa=a->a,*v; 651 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 652 PetscScalar *x,*b; 653 654 PetscFunctionBegin; 655 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 656 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 657 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 658 659 /* forward solve the U^T */ 660 idx = 0; 661 for (i=0; i<n; i++) { 662 v = aa + bs2*diag[i]; 663 /* multiply by the inverse of the block diagonal */ 664 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 665 x5 = x[4+idx]; x6 = x[5+idx]; 666 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 667 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 668 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 669 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 670 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 671 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 672 v -= bs2; 673 674 vi = aj + diag[i] - 1; 675 nz = diag[i] - diag[i+1] - 1; 676 for(j=0;j>-nz;j--){ 677 oidx = bs*vi[j]; 678 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 679 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 680 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 681 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 682 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 683 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 684 v -= bs2; 685 } 686 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 687 x[5+idx] = s6; 688 idx += bs; 689 } 690 /* backward solve the L^T */ 691 for (i=n-1; i>=0; i--){ 692 v = aa + bs2*ai[i]; 693 vi = aj + ai[i]; 694 nz = ai[i+1] - ai[i]; 695 idt = bs*i; 696 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 697 s6 = x[5+idt]; 698 for(j=0;j<nz;j++){ 699 idx = bs*vi[j]; 700 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 701 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 702 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 703 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 704 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 705 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 706 v += bs2; 707 } 708 } 709 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 711 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 712 PetscFunctionReturn(0); 713 } 714 715 #undef __FUNCT__ 716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 718 { 719 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 720 PetscErrorCode ierr; 721 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 722 PetscInt *diag = a->diag,oidx; 723 MatScalar *aa=a->a,*v; 724 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 725 PetscScalar *x,*b; 726 727 PetscFunctionBegin; 728 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 729 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 731 732 /* forward solve the U^T */ 733 idx = 0; 734 for (i=0; i<n; i++) { 735 736 v = aa + 49*diag[i]; 737 /* multiply by the inverse of the block diagonal */ 738 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 739 x6 = x[5+idx]; x7 = x[6+idx]; 740 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 741 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 742 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 743 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 744 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 745 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 746 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 747 v += 49; 748 749 vi = aj + diag[i] + 1; 750 nz = ai[i+1] - diag[i] - 1; 751 while (nz--) { 752 oidx = 7*(*vi++); 753 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 754 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 755 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 756 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 757 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 758 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 759 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 760 v += 49; 761 } 762 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 763 x[5+idx] = s6;x[6+idx] = s7; 764 idx += 7; 765 } 766 /* backward solve the L^T */ 767 for (i=n-1; i>=0; i--){ 768 v = aa + 49*diag[i] - 49; 769 vi = aj + diag[i] - 1; 770 nz = diag[i] - ai[i]; 771 idt = 7*i; 772 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 773 s6 = x[5+idt];s7 = x[6+idt]; 774 while (nz--) { 775 idx = 7*(*vi--); 776 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 777 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 778 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 779 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 780 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 781 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 782 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 783 v -= 49; 784 } 785 } 786 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 787 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 788 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 789 PetscFunctionReturn(0); 790 } 791 #undef __FUNCT__ 792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct" 793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 794 { 795 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 796 PetscErrorCode ierr; 797 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 798 PetscInt nz,idx,idt,j,i,oidx; 799 PetscInt bs=A->rmap->bs,bs2=a->bs2; 800 MatScalar *aa=a->a,*v; 801 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 802 PetscScalar *x,*b; 803 804 PetscFunctionBegin; 805 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 806 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 807 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 808 809 /* forward solve the U^T */ 810 idx = 0; 811 for (i=0; i<n; i++) { 812 v = aa + bs2*diag[i]; 813 /* multiply by the inverse of the block diagonal */ 814 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 815 x5 = x[4+idx]; x6 = x[5+idx]; x7 = x[6+idx]; 816 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 817 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 818 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 819 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 820 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 821 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 822 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 823 v -= bs2; 824 vi = aj + diag[i] - 1; 825 nz = diag[i] - diag[i+1] - 1; 826 for(j=0;j>-nz;j--){ 827 oidx = bs*vi[j]; 828 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 829 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 830 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 831 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 832 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 833 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 834 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 835 v -= bs2; 836 } 837 x[idx] = s1; x[1+idx] = s2; x[2+idx] = s3; x[3+idx] = s4; x[4+idx] = s5; 838 x[5+idx] = s6; x[6+idx] = s7; 839 idx += bs; 840 } 841 /* backward solve the L^T */ 842 for (i=n-1; i>=0; i--){ 843 v = aa + bs2*ai[i]; 844 vi = aj + ai[i]; 845 nz = ai[i+1] - ai[i]; 846 idt = bs*i; 847 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; s4 = x[3+idt]; s5 = x[4+idt]; 848 s6 = x[5+idt]; s7 = x[6+idt]; 849 for(j=0;j<nz;j++){ 850 idx = bs*vi[j]; 851 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 852 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 853 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 854 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 855 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 856 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 857 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 858 v += bs2; 859 } 860 } 861 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 862 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 863 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 864 PetscFunctionReturn(0); 865 } 866 867 /*---------------------------------------------------------------------------------------------*/ 868 #undef __FUNCT__ 869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 871 { 872 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 873 IS iscol=a->col,isrow=a->row; 874 PetscErrorCode ierr; 875 const PetscInt *r,*c,*rout,*cout; 876 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 877 PetscInt *diag = a->diag; 878 MatScalar *aa=a->a,*v; 879 PetscScalar s1,*x,*b,*t; 880 881 PetscFunctionBegin; 882 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 883 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 884 t = a->solve_work; 885 886 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 887 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 888 889 /* copy the b into temp work space according to permutation */ 890 for (i=0; i<n; i++) { 891 t[i] = b[c[i]]; 892 } 893 894 /* forward solve the U^T */ 895 for (i=0; i<n; i++) { 896 897 v = aa + diag[i]; 898 /* multiply by the inverse of the block diagonal */ 899 s1 = (*v++)*t[i]; 900 vi = aj + diag[i] + 1; 901 nz = ai[i+1] - diag[i] - 1; 902 while (nz--) { 903 t[*vi++] -= (*v++)*s1; 904 } 905 t[i] = s1; 906 } 907 /* backward solve the L^T */ 908 for (i=n-1; i>=0; i--){ 909 v = aa + diag[i] - 1; 910 vi = aj + diag[i] - 1; 911 nz = diag[i] - ai[i]; 912 s1 = t[i]; 913 while (nz--) { 914 t[*vi--] -= (*v--)*s1; 915 } 916 } 917 918 /* copy t into x according to permutation */ 919 for (i=0; i<n; i++) { 920 x[r[i]] = t[i]; 921 } 922 923 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 924 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 925 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 926 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 927 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 928 PetscFunctionReturn(0); 929 } 930 931 #undef __FUNCT__ 932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 934 { 935 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 936 IS iscol=a->col,isrow=a->row; 937 PetscErrorCode ierr; 938 const PetscInt *r,*c,*rout,*cout; 939 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 940 PetscInt *diag = a->diag,ii,ic,ir,oidx; 941 MatScalar *aa=a->a,*v; 942 PetscScalar s1,s2,x1,x2; 943 PetscScalar *x,*b,*t; 944 945 PetscFunctionBegin; 946 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 947 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 948 t = a->solve_work; 949 950 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 951 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 952 953 /* copy the b into temp work space according to permutation */ 954 ii = 0; 955 for (i=0; i<n; i++) { 956 ic = 2*c[i]; 957 t[ii] = b[ic]; 958 t[ii+1] = b[ic+1]; 959 ii += 2; 960 } 961 962 /* forward solve the U^T */ 963 idx = 0; 964 for (i=0; i<n; i++) { 965 966 v = aa + 4*diag[i]; 967 /* multiply by the inverse of the block diagonal */ 968 x1 = t[idx]; x2 = t[1+idx]; 969 s1 = v[0]*x1 + v[1]*x2; 970 s2 = v[2]*x1 + v[3]*x2; 971 v += 4; 972 973 vi = aj + diag[i] + 1; 974 nz = ai[i+1] - diag[i] - 1; 975 while (nz--) { 976 oidx = 2*(*vi++); 977 t[oidx] -= v[0]*s1 + v[1]*s2; 978 t[oidx+1] -= v[2]*s1 + v[3]*s2; 979 v += 4; 980 } 981 t[idx] = s1;t[1+idx] = s2; 982 idx += 2; 983 } 984 /* backward solve the L^T */ 985 for (i=n-1; i>=0; i--){ 986 v = aa + 4*diag[i] - 4; 987 vi = aj + diag[i] - 1; 988 nz = diag[i] - ai[i]; 989 idt = 2*i; 990 s1 = t[idt]; s2 = t[1+idt]; 991 while (nz--) { 992 idx = 2*(*vi--); 993 t[idx] -= v[0]*s1 + v[1]*s2; 994 t[idx+1] -= v[2]*s1 + v[3]*s2; 995 v -= 4; 996 } 997 } 998 999 /* copy t into x according to permutation */ 1000 ii = 0; 1001 for (i=0; i<n; i++) { 1002 ir = 2*r[i]; 1003 x[ir] = t[ii]; 1004 x[ir+1] = t[ii+1]; 1005 ii += 2; 1006 } 1007 1008 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1009 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1010 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1011 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1012 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 1013 PetscFunctionReturn(0); 1014 } 1015 1016 #undef __FUNCT__ 1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_newdatastruct" 1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 1019 { 1020 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1021 PetscErrorCode ierr; 1022 IS iscol=a->col,isrow=a->row; 1023 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1024 const PetscInt *r,*c,*rout,*cout; 1025 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1026 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1027 MatScalar *aa=a->a,*v; 1028 PetscScalar s1,s2,x1,x2; 1029 PetscScalar *x,*b,*t; 1030 1031 PetscFunctionBegin; 1032 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1033 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1034 t = a->solve_work; 1035 1036 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1037 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1038 1039 /* copy b into temp work space according to permutation */ 1040 for(i=0;i<n;i++){ 1041 ii = bs*i; ic = bs*c[i]; 1042 t[ii] = b[ic]; t[ii+1] = b[ic+1]; 1043 } 1044 1045 /* forward solve the U^T */ 1046 idx = 0; 1047 for (i=0; i<n; i++) { 1048 v = aa + bs2*diag[i]; 1049 /* multiply by the inverse of the block diagonal */ 1050 x1 = t[idx]; x2 = t[1+idx]; 1051 s1 = v[0]*x1 + v[1]*x2; 1052 s2 = v[2]*x1 + v[3]*x2; 1053 v -= bs2; 1054 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - diag[i+1] - 1; 1057 for(j=0;j>-nz;j--){ 1058 oidx = bs*vi[j]; 1059 t[oidx] -= v[0]*s1 + v[1]*s2; 1060 t[oidx+1] -= v[2]*s1 + v[3]*s2; 1061 v -= bs2; 1062 } 1063 t[idx] = s1;t[1+idx] = s2; 1064 idx += bs; 1065 } 1066 /* backward solve the L^T */ 1067 for (i=n-1; i>=0; i--){ 1068 v = aa + bs2*ai[i]; 1069 vi = aj + ai[i]; 1070 nz = ai[i+1] - ai[i]; 1071 idt = bs*i; 1072 s1 = t[idt]; s2 = t[1+idt]; 1073 for(j=0;j<nz;j++){ 1074 idx = bs*vi[j]; 1075 t[idx] -= v[0]*s1 + v[1]*s2; 1076 t[idx+1] -= v[2]*s1 + v[3]*s2; 1077 v += bs2; 1078 } 1079 } 1080 1081 /* copy t into x according to permutation */ 1082 for(i=0;i<n;i++){ 1083 ii = bs*i; ir = bs*r[i]; 1084 x[ir] = t[ii]; x[ir+1] = t[ii+1]; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 #undef __FUNCT__ 1096 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 1097 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 1098 { 1099 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1100 IS iscol=a->col,isrow=a->row; 1101 PetscErrorCode ierr; 1102 const PetscInt *r,*c,*rout,*cout; 1103 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1104 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1105 MatScalar *aa=a->a,*v; 1106 PetscScalar s1,s2,s3,x1,x2,x3; 1107 PetscScalar *x,*b,*t; 1108 1109 PetscFunctionBegin; 1110 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1111 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1112 t = a->solve_work; 1113 1114 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1115 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1116 1117 /* copy the b into temp work space according to permutation */ 1118 ii = 0; 1119 for (i=0; i<n; i++) { 1120 ic = 3*c[i]; 1121 t[ii] = b[ic]; 1122 t[ii+1] = b[ic+1]; 1123 t[ii+2] = b[ic+2]; 1124 ii += 3; 1125 } 1126 1127 /* forward solve the U^T */ 1128 idx = 0; 1129 for (i=0; i<n; i++) { 1130 1131 v = aa + 9*diag[i]; 1132 /* multiply by the inverse of the block diagonal */ 1133 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1134 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1135 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1136 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1137 v += 9; 1138 1139 vi = aj + diag[i] + 1; 1140 nz = ai[i+1] - diag[i] - 1; 1141 while (nz--) { 1142 oidx = 3*(*vi++); 1143 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1144 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1145 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1146 v += 9; 1147 } 1148 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1149 idx += 3; 1150 } 1151 /* backward solve the L^T */ 1152 for (i=n-1; i>=0; i--){ 1153 v = aa + 9*diag[i] - 9; 1154 vi = aj + diag[i] - 1; 1155 nz = diag[i] - ai[i]; 1156 idt = 3*i; 1157 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1158 while (nz--) { 1159 idx = 3*(*vi--); 1160 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1161 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1162 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1163 v -= 9; 1164 } 1165 } 1166 1167 /* copy t into x according to permutation */ 1168 ii = 0; 1169 for (i=0; i<n; i++) { 1170 ir = 3*r[i]; 1171 x[ir] = t[ii]; 1172 x[ir+1] = t[ii+1]; 1173 x[ir+2] = t[ii+2]; 1174 ii += 3; 1175 } 1176 1177 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1178 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1179 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1180 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1181 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 1182 PetscFunctionReturn(0); 1183 } 1184 1185 #undef __FUNCT__ 1186 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_newdatastruct" 1187 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 1188 { 1189 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1190 PetscErrorCode ierr; 1191 IS iscol=a->col,isrow=a->row; 1192 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1193 const PetscInt *r,*c,*rout,*cout; 1194 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1195 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1196 MatScalar *aa=a->a,*v; 1197 PetscScalar s1,s2,s3,x1,x2,x3; 1198 PetscScalar *x,*b,*t; 1199 1200 PetscFunctionBegin; 1201 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1202 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1203 t = a->solve_work; 1204 1205 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1206 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1207 1208 /* copy b into temp work space according to permutation */ 1209 for(i=0;i<n;i++){ 1210 ii = bs*i; ic = bs*c[i]; 1211 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; 1212 } 1213 1214 /* forward solve the U^T */ 1215 idx = 0; 1216 for (i=0; i<n; i++) { 1217 v = aa + bs2*diag[i]; 1218 /* multiply by the inverse of the block diagonal */ 1219 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1220 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 1221 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 1222 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 1223 v -= bs2; 1224 1225 vi = aj + diag[i] - 1; 1226 nz = diag[i] - diag[i+1] - 1; 1227 for(j=0;j>-nz;j--){ 1228 oidx = bs*vi[j]; 1229 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1230 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1231 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1232 v -= bs2; 1233 } 1234 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 1235 idx += bs; 1236 } 1237 /* backward solve the L^T */ 1238 for (i=n-1; i>=0; i--){ 1239 v = aa + bs2*ai[i]; 1240 vi = aj + ai[i]; 1241 nz = ai[i+1] - ai[i]; 1242 idt = bs*i; 1243 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1244 for(j=0;j<nz;j++){ 1245 idx = bs*vi[j]; 1246 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 1247 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 1248 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 1249 v += bs2; 1250 } 1251 } 1252 1253 /* copy t into x according to permutation */ 1254 for(i=0;i<n;i++){ 1255 ii = bs*i; ir = bs*r[i]; 1256 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; 1257 } 1258 1259 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1260 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1261 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1262 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1263 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1264 PetscFunctionReturn(0); 1265 } 1266 1267 #undef __FUNCT__ 1268 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 1269 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1270 { 1271 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1272 IS iscol=a->col,isrow=a->row; 1273 PetscErrorCode ierr; 1274 const PetscInt *r,*c,*rout,*cout; 1275 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1276 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1277 MatScalar *aa=a->a,*v; 1278 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1279 PetscScalar *x,*b,*t; 1280 1281 PetscFunctionBegin; 1282 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1283 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1284 t = a->solve_work; 1285 1286 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1287 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1288 1289 /* copy the b into temp work space according to permutation */ 1290 ii = 0; 1291 for (i=0; i<n; i++) { 1292 ic = 4*c[i]; 1293 t[ii] = b[ic]; 1294 t[ii+1] = b[ic+1]; 1295 t[ii+2] = b[ic+2]; 1296 t[ii+3] = b[ic+3]; 1297 ii += 4; 1298 } 1299 1300 /* forward solve the U^T */ 1301 idx = 0; 1302 for (i=0; i<n; i++) { 1303 1304 v = aa + 16*diag[i]; 1305 /* multiply by the inverse of the block diagonal */ 1306 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1307 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1308 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1309 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1310 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1311 v += 16; 1312 1313 vi = aj + diag[i] + 1; 1314 nz = ai[i+1] - diag[i] - 1; 1315 while (nz--) { 1316 oidx = 4*(*vi++); 1317 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1318 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1319 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1320 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1321 v += 16; 1322 } 1323 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 1324 idx += 4; 1325 } 1326 /* backward solve the L^T */ 1327 for (i=n-1; i>=0; i--){ 1328 v = aa + 16*diag[i] - 16; 1329 vi = aj + diag[i] - 1; 1330 nz = diag[i] - ai[i]; 1331 idt = 4*i; 1332 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 1333 while (nz--) { 1334 idx = 4*(*vi--); 1335 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1336 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1337 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1338 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1339 v -= 16; 1340 } 1341 } 1342 1343 /* copy t into x according to permutation */ 1344 ii = 0; 1345 for (i=0; i<n; i++) { 1346 ir = 4*r[i]; 1347 x[ir] = t[ii]; 1348 x[ir+1] = t[ii+1]; 1349 x[ir+2] = t[ii+2]; 1350 x[ir+3] = t[ii+3]; 1351 ii += 4; 1352 } 1353 1354 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1355 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1356 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1357 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1358 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1359 PetscFunctionReturn(0); 1360 } 1361 1362 #undef __FUNCT__ 1363 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_newdatastruct" 1364 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 1365 { 1366 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1367 PetscErrorCode ierr; 1368 IS iscol=a->col,isrow=a->row; 1369 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1370 const PetscInt *r,*c,*rout,*cout; 1371 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1372 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1373 MatScalar *aa=a->a,*v; 1374 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 1375 PetscScalar *x,*b,*t; 1376 1377 PetscFunctionBegin; 1378 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1379 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1380 t = a->solve_work; 1381 1382 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1383 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1384 1385 /* copy b into temp work space according to permutation */ 1386 for(i=0;i<n;i++){ 1387 ii = bs*i; ic = bs*c[i]; 1388 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1389 } 1390 1391 /* forward solve the U^T */ 1392 idx = 0; 1393 for (i=0; i<n; i++) { 1394 v = aa + bs2*diag[i]; 1395 /* multiply by the inverse of the block diagonal */ 1396 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 1397 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 1398 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 1399 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 1400 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 1401 v -= bs2; 1402 1403 vi = aj + diag[i] - 1; 1404 nz = diag[i] - diag[i+1] - 1; 1405 for(j=0;j>-nz;j--){ 1406 oidx = bs*vi[j]; 1407 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1408 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1409 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1410 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1411 v -= bs2; 1412 } 1413 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; 1414 idx += bs; 1415 } 1416 /* backward solve the L^T */ 1417 for (i=n-1; i>=0; i--){ 1418 v = aa + bs2*ai[i]; 1419 vi = aj + ai[i]; 1420 nz = ai[i+1] - ai[i]; 1421 idt = bs*i; 1422 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; 1423 for(j=0;j<nz;j++){ 1424 idx = bs*vi[j]; 1425 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 1426 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 1427 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 1428 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 1429 v += bs2; 1430 } 1431 } 1432 1433 /* copy t into x according to permutation */ 1434 for(i=0;i<n;i++){ 1435 ii = bs*i; ir = bs*r[i]; 1436 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1437 } 1438 1439 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1440 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1443 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1444 PetscFunctionReturn(0); 1445 } 1446 1447 #undef __FUNCT__ 1448 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 1449 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1450 { 1451 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1452 IS iscol=a->col,isrow=a->row; 1453 PetscErrorCode ierr; 1454 const PetscInt *r,*c,*rout,*cout; 1455 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1456 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1457 MatScalar *aa=a->a,*v; 1458 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1459 PetscScalar *x,*b,*t; 1460 1461 PetscFunctionBegin; 1462 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1463 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1464 t = a->solve_work; 1465 1466 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1467 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1468 1469 /* copy the b into temp work space according to permutation */ 1470 ii = 0; 1471 for (i=0; i<n; i++) { 1472 ic = 5*c[i]; 1473 t[ii] = b[ic]; 1474 t[ii+1] = b[ic+1]; 1475 t[ii+2] = b[ic+2]; 1476 t[ii+3] = b[ic+3]; 1477 t[ii+4] = b[ic+4]; 1478 ii += 5; 1479 } 1480 1481 /* forward solve the U^T */ 1482 idx = 0; 1483 for (i=0; i<n; i++) { 1484 1485 v = aa + 25*diag[i]; 1486 /* multiply by the inverse of the block diagonal */ 1487 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1488 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1489 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1490 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1491 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1492 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1493 v += 25; 1494 1495 vi = aj + diag[i] + 1; 1496 nz = ai[i+1] - diag[i] - 1; 1497 while (nz--) { 1498 oidx = 5*(*vi++); 1499 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1500 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1501 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1502 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1503 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1504 v += 25; 1505 } 1506 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1507 idx += 5; 1508 } 1509 /* backward solve the L^T */ 1510 for (i=n-1; i>=0; i--){ 1511 v = aa + 25*diag[i] - 25; 1512 vi = aj + diag[i] - 1; 1513 nz = diag[i] - ai[i]; 1514 idt = 5*i; 1515 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1516 while (nz--) { 1517 idx = 5*(*vi--); 1518 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1519 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1520 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1521 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1522 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1523 v -= 25; 1524 } 1525 } 1526 1527 /* copy t into x according to permutation */ 1528 ii = 0; 1529 for (i=0; i<n; i++) { 1530 ir = 5*r[i]; 1531 x[ir] = t[ii]; 1532 x[ir+1] = t[ii+1]; 1533 x[ir+2] = t[ii+2]; 1534 x[ir+3] = t[ii+3]; 1535 x[ir+4] = t[ii+4]; 1536 ii += 5; 1537 } 1538 1539 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1540 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1541 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1542 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1543 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1544 PetscFunctionReturn(0); 1545 } 1546 1547 #undef __FUNCT__ 1548 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_newdatastruct" 1549 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 1550 { 1551 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1552 PetscErrorCode ierr; 1553 IS iscol=a->col,isrow=a->row; 1554 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1555 const PetscInt *r,*c,*rout,*cout; 1556 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1557 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1558 MatScalar *aa=a->a,*v; 1559 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1560 PetscScalar *x,*b,*t; 1561 1562 PetscFunctionBegin; 1563 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1564 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1565 t = a->solve_work; 1566 1567 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1568 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1569 1570 /* copy b into temp work space according to permutation */ 1571 for(i=0;i<n;i++){ 1572 ii = bs*i; ic = bs*c[i]; 1573 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1574 t[ii+4] = b[ic+4]; 1575 } 1576 1577 /* forward solve the U^T */ 1578 idx = 0; 1579 for (i=0; i<n; i++) { 1580 v = aa + bs2*diag[i]; 1581 /* multiply by the inverse of the block diagonal */ 1582 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1583 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 1584 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 1585 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 1586 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 1587 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 1588 v -= bs2; 1589 1590 vi = aj + diag[i] - 1; 1591 nz = diag[i] - diag[i+1] - 1; 1592 for(j=0;j>-nz;j--){ 1593 oidx = bs*vi[j]; 1594 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1595 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1596 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1597 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1598 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1599 v -= bs2; 1600 } 1601 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1602 idx += bs; 1603 } 1604 /* backward solve the L^T */ 1605 for (i=n-1; i>=0; i--){ 1606 v = aa + bs2*ai[i]; 1607 vi = aj + ai[i]; 1608 nz = ai[i+1] - ai[i]; 1609 idt = bs*i; 1610 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1611 for(j=0;j<nz;j++){ 1612 idx = bs*vi[j]; 1613 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 1614 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 1615 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 1616 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 1617 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 1618 v += bs2; 1619 } 1620 } 1621 1622 /* copy t into x according to permutation */ 1623 for(i=0;i<n;i++){ 1624 ii = bs*i; ir = bs*r[i]; 1625 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1626 x[ir+4] = t[ii+4]; 1627 } 1628 1629 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1630 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1631 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1633 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1634 PetscFunctionReturn(0); 1635 } 1636 1637 #undef __FUNCT__ 1638 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 1639 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1640 { 1641 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1642 IS iscol=a->col,isrow=a->row; 1643 PetscErrorCode ierr; 1644 const PetscInt *r,*c,*rout,*cout; 1645 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1646 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1647 MatScalar *aa=a->a,*v; 1648 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1649 PetscScalar *x,*b,*t; 1650 1651 PetscFunctionBegin; 1652 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1653 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1654 t = a->solve_work; 1655 1656 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1657 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1658 1659 /* copy the b into temp work space according to permutation */ 1660 ii = 0; 1661 for (i=0; i<n; i++) { 1662 ic = 6*c[i]; 1663 t[ii] = b[ic]; 1664 t[ii+1] = b[ic+1]; 1665 t[ii+2] = b[ic+2]; 1666 t[ii+3] = b[ic+3]; 1667 t[ii+4] = b[ic+4]; 1668 t[ii+5] = b[ic+5]; 1669 ii += 6; 1670 } 1671 1672 /* forward solve the U^T */ 1673 idx = 0; 1674 for (i=0; i<n; i++) { 1675 1676 v = aa + 36*diag[i]; 1677 /* multiply by the inverse of the block diagonal */ 1678 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1679 x6 = t[5+idx]; 1680 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1681 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1682 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1683 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1684 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1685 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1686 v += 36; 1687 1688 vi = aj + diag[i] + 1; 1689 nz = ai[i+1] - diag[i] - 1; 1690 while (nz--) { 1691 oidx = 6*(*vi++); 1692 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1693 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1694 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1695 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1696 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1697 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1698 v += 36; 1699 } 1700 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1701 t[5+idx] = s6; 1702 idx += 6; 1703 } 1704 /* backward solve the L^T */ 1705 for (i=n-1; i>=0; i--){ 1706 v = aa + 36*diag[i] - 36; 1707 vi = aj + diag[i] - 1; 1708 nz = diag[i] - ai[i]; 1709 idt = 6*i; 1710 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1711 s6 = t[5+idt]; 1712 while (nz--) { 1713 idx = 6*(*vi--); 1714 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1715 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1716 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1717 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1718 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1719 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1720 v -= 36; 1721 } 1722 } 1723 1724 /* copy t into x according to permutation */ 1725 ii = 0; 1726 for (i=0; i<n; i++) { 1727 ir = 6*r[i]; 1728 x[ir] = t[ii]; 1729 x[ir+1] = t[ii+1]; 1730 x[ir+2] = t[ii+2]; 1731 x[ir+3] = t[ii+3]; 1732 x[ir+4] = t[ii+4]; 1733 x[ir+5] = t[ii+5]; 1734 ii += 6; 1735 } 1736 1737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1739 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 #undef __FUNCT__ 1746 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_newdatastruct" 1747 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 1748 { 1749 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1750 PetscErrorCode ierr; 1751 IS iscol=a->col,isrow=a->row; 1752 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1753 const PetscInt *r,*c,*rout,*cout; 1754 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1755 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1756 MatScalar *aa=a->a,*v; 1757 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1758 PetscScalar *x,*b,*t; 1759 1760 PetscFunctionBegin; 1761 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 t = a->solve_work; 1764 1765 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1766 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1767 1768 /* copy b into temp work space according to permutation */ 1769 for(i=0;i<n;i++){ 1770 ii = bs*i; ic = bs*c[i]; 1771 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1772 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; 1773 } 1774 1775 /* forward solve the U^T */ 1776 idx = 0; 1777 for (i=0; i<n; i++) { 1778 v = aa + bs2*diag[i]; 1779 /* multiply by the inverse of the block diagonal */ 1780 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1781 x6 = t[5+idx]; 1782 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 1783 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 1784 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 1785 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1786 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1787 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1788 v -= bs2; 1789 1790 vi = aj + diag[i] - 1; 1791 nz = diag[i] - diag[i+1] - 1; 1792 for(j=0;j>-nz;j--){ 1793 oidx = bs*vi[j]; 1794 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1795 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1796 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1797 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1798 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1799 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1800 v -= bs2; 1801 } 1802 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 1803 t[5+idx] = s6; 1804 idx += bs; 1805 } 1806 /* backward solve the L^T */ 1807 for (i=n-1; i>=0; i--){ 1808 v = aa + bs2*ai[i]; 1809 vi = aj + ai[i]; 1810 nz = ai[i+1] - ai[i]; 1811 idt = bs*i; 1812 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 1813 s6 = t[5+idt]; 1814 for(j=0;j<nz;j++){ 1815 idx = bs*vi[j]; 1816 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1817 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1818 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1819 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1820 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1821 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1822 v += bs2; 1823 } 1824 } 1825 1826 /* copy t into x according to permutation */ 1827 for(i=0;i<n;i++){ 1828 ii = bs*i; ir = bs*r[i]; 1829 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 1830 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; 1831 } 1832 1833 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1834 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1835 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1836 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1837 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 1838 PetscFunctionReturn(0); 1839 } 1840 1841 #undef __FUNCT__ 1842 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1843 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1844 { 1845 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1846 IS iscol=a->col,isrow=a->row; 1847 PetscErrorCode ierr; 1848 const PetscInt *r,*c,*rout,*cout; 1849 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1850 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1851 MatScalar *aa=a->a,*v; 1852 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1853 PetscScalar *x,*b,*t; 1854 1855 PetscFunctionBegin; 1856 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1857 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1858 t = a->solve_work; 1859 1860 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1861 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1862 1863 /* copy the b into temp work space according to permutation */ 1864 ii = 0; 1865 for (i=0; i<n; i++) { 1866 ic = 7*c[i]; 1867 t[ii] = b[ic]; 1868 t[ii+1] = b[ic+1]; 1869 t[ii+2] = b[ic+2]; 1870 t[ii+3] = b[ic+3]; 1871 t[ii+4] = b[ic+4]; 1872 t[ii+5] = b[ic+5]; 1873 t[ii+6] = b[ic+6]; 1874 ii += 7; 1875 } 1876 1877 /* forward solve the U^T */ 1878 idx = 0; 1879 for (i=0; i<n; i++) { 1880 1881 v = aa + 49*diag[i]; 1882 /* multiply by the inverse of the block diagonal */ 1883 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1884 x6 = t[5+idx]; x7 = t[6+idx]; 1885 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1886 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1887 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1888 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1889 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1890 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1891 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1892 v += 49; 1893 1894 vi = aj + diag[i] + 1; 1895 nz = ai[i+1] - diag[i] - 1; 1896 while (nz--) { 1897 oidx = 7*(*vi++); 1898 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1899 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1900 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1901 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1902 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1903 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1904 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1905 v += 49; 1906 } 1907 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1908 t[5+idx] = s6;t[6+idx] = s7; 1909 idx += 7; 1910 } 1911 /* backward solve the L^T */ 1912 for (i=n-1; i>=0; i--){ 1913 v = aa + 49*diag[i] - 49; 1914 vi = aj + diag[i] - 1; 1915 nz = diag[i] - ai[i]; 1916 idt = 7*i; 1917 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1918 s6 = t[5+idt];s7 = t[6+idt]; 1919 while (nz--) { 1920 idx = 7*(*vi--); 1921 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1922 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1923 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1924 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1925 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1926 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1927 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1928 v -= 49; 1929 } 1930 } 1931 1932 /* copy t into x according to permutation */ 1933 ii = 0; 1934 for (i=0; i<n; i++) { 1935 ir = 7*r[i]; 1936 x[ir] = t[ii]; 1937 x[ir+1] = t[ii+1]; 1938 x[ir+2] = t[ii+2]; 1939 x[ir+3] = t[ii+3]; 1940 x[ir+4] = t[ii+4]; 1941 x[ir+5] = t[ii+5]; 1942 x[ir+6] = t[ii+6]; 1943 ii += 7; 1944 } 1945 1946 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1947 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1948 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1949 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1950 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1951 PetscFunctionReturn(0); 1952 } 1953 #undef __FUNCT__ 1954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_newdatastruct" 1955 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 1956 { 1957 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1958 PetscErrorCode ierr; 1959 IS iscol=a->col,isrow=a->row; 1960 PetscInt n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag; 1961 const PetscInt *r,*c,*rout,*cout; 1962 PetscInt nz,idx,idt,j,i,oidx,ii,ic,ir; 1963 PetscInt bs=A->rmap->bs,bs2=a->bs2; 1964 MatScalar *aa=a->a,*v; 1965 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1966 PetscScalar *x,*b,*t; 1967 1968 PetscFunctionBegin; 1969 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1970 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1971 t = a->solve_work; 1972 1973 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1974 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1975 1976 /* copy b into temp work space according to permutation */ 1977 for(i=0;i<n;i++){ 1978 ii = bs*i; ic = bs*c[i]; 1979 t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3]; 1980 t[ii+4] = b[ic+4]; t[ii+5] = b[ic+5]; t[ii+6] = b[ic+6]; 1981 } 1982 1983 /* forward solve the U^T */ 1984 idx = 0; 1985 for (i=0; i<n; i++) { 1986 v = aa + bs2*diag[i]; 1987 /* multiply by the inverse of the block diagonal */ 1988 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1989 x6 = t[5+idx]; x7 = t[6+idx]; 1990 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1991 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1992 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1993 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1994 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1995 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1996 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1997 v -= bs2; 1998 1999 vi = aj + diag[i] - 1; 2000 nz = diag[i] - diag[i+1] - 1; 2001 for(j=0;j>-nz;j--){ 2002 oidx = bs*vi[j]; 2003 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2004 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2005 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2006 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2007 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2008 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2009 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2010 v -= bs2; 2011 } 2012 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; t[3+idx] = s4; t[4+idx] =s5; 2013 t[5+idx] = s6; t[6+idx] = s7; 2014 idx += bs; 2015 } 2016 /* backward solve the L^T */ 2017 for (i=n-1; i>=0; i--){ 2018 v = aa + bs2*ai[i]; 2019 vi = aj + ai[i]; 2020 nz = ai[i+1] - ai[i]; 2021 idt = bs*i; 2022 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; s4 = t[3+idt]; s5 = t[4+idt]; 2023 s6 = t[5+idt]; s7 = t[6+idt]; 2024 for(j=0;j<nz;j++){ 2025 idx = bs*vi[j]; 2026 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 2027 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 2028 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 2029 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 2030 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 2031 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 2032 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 2033 v += bs2; 2034 } 2035 } 2036 2037 /* copy t into x according to permutation */ 2038 for(i=0;i<n;i++){ 2039 ii = bs*i; ir = bs*r[i]; 2040 x[ir] = t[ii]; x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2]; x[ir+3] = t[ii+3]; 2041 x[ir+4] = t[ii+4]; x[ir+5] = t[ii+5]; x[ir+6] = t[ii+6]; 2042 } 2043 2044 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2045 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2046 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2047 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2048 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2049 PetscFunctionReturn(0); 2050 } 2051 2052 /* ----------------------------------------------------------- */ 2053 #undef __FUNCT__ 2054 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 2055 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2056 { 2057 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2058 IS iscol=a->col,isrow=a->row; 2059 PetscErrorCode ierr; 2060 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2061 PetscInt i,n=a->mbs; 2062 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2063 MatScalar *aa=a->a,*v; 2064 PetscScalar *x,*b,*s,*t,*ls; 2065 2066 PetscFunctionBegin; 2067 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2068 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2069 t = a->solve_work; 2070 2071 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2072 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2073 2074 /* forward solve the lower triangular */ 2075 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2076 for (i=1; i<n; i++) { 2077 v = aa + bs2*ai[i]; 2078 vi = aj + ai[i]; 2079 nz = a->diag[i] - ai[i]; 2080 s = t + bs*i; 2081 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 2082 while (nz--) { 2083 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 2084 v += bs2; 2085 } 2086 } 2087 /* backward solve the upper triangular */ 2088 ls = a->solve_work + A->cmap->n; 2089 for (i=n-1; i>=0; i--){ 2090 v = aa + bs2*(a->diag[i] + 1); 2091 vi = aj + a->diag[i] + 1; 2092 nz = ai[i+1] - a->diag[i] - 1; 2093 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2094 while (nz--) { 2095 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 2096 v += bs2; 2097 } 2098 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2099 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2100 } 2101 2102 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2103 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2104 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2105 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2106 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2107 PetscFunctionReturn(0); 2108 } 2109 2110 /* ----------------------------------------------------------- */ 2111 #undef __FUNCT__ 2112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N" 2113 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 2114 { 2115 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2116 IS iscol=a->col,isrow=a->row; 2117 PetscErrorCode ierr; 2118 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 2119 PetscInt i,n=a->mbs,j; 2120 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2121 const MatScalar *aa=a->a,*v; 2122 PetscScalar *x,*t,*ls; 2123 const PetscScalar *b; 2124 PetscFunctionBegin; 2125 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2126 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2127 t = a->solve_work; 2128 2129 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2130 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2131 2132 /* copy the b into temp work space according to permutation */ 2133 for (i=0; i<n; i++) { 2134 for (j=0; j<bs; j++) { 2135 t[i*bs+j] = b[c[i]*bs+j]; 2136 } 2137 } 2138 2139 2140 /* forward solve the upper triangular transpose */ 2141 ls = a->solve_work + A->cmap->n; 2142 for (i=0; i<n; i++){ 2143 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2144 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 2145 v = aa + bs2*(a->diag[i] + 1); 2146 vi = aj + a->diag[i] + 1; 2147 nz = ai[i+1] - a->diag[i] - 1; 2148 while (nz--) { 2149 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2150 v += bs2; 2151 } 2152 } 2153 2154 /* backward solve the lower triangular transpose */ 2155 for (i=n-1; i>=0; i--) { 2156 v = aa + bs2*ai[i]; 2157 vi = aj + ai[i]; 2158 nz = a->diag[i] - ai[i]; 2159 while (nz--) { 2160 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs); 2161 v += bs2; 2162 } 2163 } 2164 2165 /* copy t into x according to permutation */ 2166 for (i=0; i<n; i++) { 2167 for (j=0; j<bs; j++) { 2168 x[bs*r[i]+j] = t[bs*i+j]; 2169 } 2170 } 2171 2172 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2173 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2174 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2175 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2176 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2177 PetscFunctionReturn(0); 2178 } 2179 2180 #undef __FUNCT__ 2181 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct" 2182 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx) 2183 { 2184 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2185 IS iscol=a->col,isrow=a->row; 2186 PetscErrorCode ierr; 2187 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag; 2188 PetscInt i,n=a->mbs,j; 2189 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 2190 const MatScalar *aa=a->a,*v; 2191 PetscScalar *x,*t,*ls; 2192 const PetscScalar *b; 2193 PetscFunctionBegin; 2194 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2195 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2196 t = a->solve_work; 2197 2198 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2199 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2200 2201 /* copy the b into temp work space according to permutation */ 2202 for (i=0; i<n; i++) { 2203 for (j=0; j<bs; j++) { 2204 t[i*bs+j] = b[c[i]*bs+j]; 2205 } 2206 } 2207 2208 2209 /* forward solve the upper triangular transpose */ 2210 ls = a->solve_work + A->cmap->n; 2211 for (i=0; i<n; i++){ 2212 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 2213 Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs); 2214 v = aa + bs2*(diag[i] - 1); 2215 vi = aj + diag[i] - 1; 2216 nz = diag[i] - diag[i+1] - 1; 2217 for(j=0;j>-nz;j--){ 2218 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2219 v -= bs2; 2220 } 2221 } 2222 2223 /* backward solve the lower triangular transpose */ 2224 for (i=n-1; i>=0; i--) { 2225 v = aa + bs2*ai[i]; 2226 vi = aj + ai[i]; 2227 nz = ai[i+1] - ai[i]; 2228 for(j=0;j<nz;j++){ 2229 Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs); 2230 v += bs2; 2231 } 2232 } 2233 2234 /* copy t into x according to permutation */ 2235 for (i=0; i<n; i++) { 2236 for (j=0; j<bs; j++) { 2237 x[bs*r[i]+j] = t[bs*i+j]; 2238 } 2239 } 2240 2241 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2242 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2243 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2244 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2245 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 2246 PetscFunctionReturn(0); 2247 } 2248 2249 #undef __FUNCT__ 2250 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 2251 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 2252 { 2253 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2254 IS iscol=a->col,isrow=a->row; 2255 PetscErrorCode ierr; 2256 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 2257 PetscInt i,n=a->mbs,nz,idx,idt,idc; 2258 MatScalar *aa=a->a,*v; 2259 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2260 PetscScalar *x,*b,*t; 2261 2262 PetscFunctionBegin; 2263 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2265 t = a->solve_work; 2266 2267 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2268 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2269 2270 /* forward solve the lower triangular */ 2271 idx = 7*(*r++); 2272 t[0] = b[idx]; t[1] = b[1+idx]; 2273 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2274 t[5] = b[5+idx]; t[6] = b[6+idx]; 2275 2276 for (i=1; i<n; i++) { 2277 v = aa + 49*ai[i]; 2278 vi = aj + ai[i]; 2279 nz = diag[i] - ai[i]; 2280 idx = 7*(*r++); 2281 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2282 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2283 while (nz--) { 2284 idx = 7*(*vi++); 2285 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2286 x4 = t[3+idx];x5 = t[4+idx]; 2287 x6 = t[5+idx];x7 = t[6+idx]; 2288 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2289 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2290 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2291 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2292 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2293 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2294 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2295 v += 49; 2296 } 2297 idx = 7*i; 2298 t[idx] = s1;t[1+idx] = s2; 2299 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2300 t[5+idx] = s6;t[6+idx] = s7; 2301 } 2302 /* backward solve the upper triangular */ 2303 for (i=n-1; i>=0; i--){ 2304 v = aa + 49*diag[i] + 49; 2305 vi = aj + diag[i] + 1; 2306 nz = ai[i+1] - diag[i] - 1; 2307 idt = 7*i; 2308 s1 = t[idt]; s2 = t[1+idt]; 2309 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2310 s6 = t[5+idt];s7 = t[6+idt]; 2311 while (nz--) { 2312 idx = 7*(*vi++); 2313 x1 = t[idx]; x2 = t[1+idx]; 2314 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2315 x6 = t[5+idx]; x7 = t[6+idx]; 2316 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2317 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2318 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2319 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2320 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2321 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2322 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2323 v += 49; 2324 } 2325 idc = 7*(*c--); 2326 v = aa + 49*diag[i]; 2327 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2328 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2329 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2330 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2331 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2332 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2333 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2334 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2335 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2336 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2337 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2338 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2339 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2340 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2341 } 2342 2343 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2344 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2345 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2346 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2347 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2348 PetscFunctionReturn(0); 2349 } 2350 2351 #undef __FUNCT__ 2352 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct" 2353 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx) 2354 { 2355 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2356 IS iscol=a->col,isrow=a->row; 2357 PetscErrorCode ierr; 2358 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi; 2359 PetscInt i,n=a->mbs,nz,idx,idt,idc,m; 2360 MatScalar *aa=a->a,*v; 2361 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2362 PetscScalar *x,*b,*t; 2363 2364 PetscFunctionBegin; 2365 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2366 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2367 t = a->solve_work; 2368 2369 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2370 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2371 2372 /* forward solve the lower triangular */ 2373 idx = 7*r[0]; 2374 t[0] = b[idx]; t[1] = b[1+idx]; 2375 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 2376 t[5] = b[5+idx]; t[6] = b[6+idx]; 2377 2378 for (i=1; i<n; i++) { 2379 v = aa + 49*ai[i]; 2380 vi = aj + ai[i]; 2381 nz = ai[i+1] - ai[i]; 2382 idx = 7*r[i]; 2383 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2384 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2385 for(m=0;m<nz;m++){ 2386 idx = 7*vi[m]; 2387 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 2388 x4 = t[3+idx];x5 = t[4+idx]; 2389 x6 = t[5+idx];x7 = t[6+idx]; 2390 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2391 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2392 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2393 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2394 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2395 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2396 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2397 v += 49; 2398 } 2399 idx = 7*i; 2400 t[idx] = s1;t[1+idx] = s2; 2401 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 2402 t[5+idx] = s6;t[6+idx] = s7; 2403 } 2404 /* backward solve the upper triangular */ 2405 for (i=n-1; i>=0; i--){ 2406 v = aa + 49*(adiag[i+1]+1); 2407 vi = aj + adiag[i+1]+1; 2408 nz = adiag[i] - adiag[i+1] - 1; 2409 idt = 7*i; 2410 s1 = t[idt]; s2 = t[1+idt]; 2411 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 2412 s6 = t[5+idt];s7 = t[6+idt]; 2413 for(m=0;m<nz;m++){ 2414 idx = 7*vi[m]; 2415 x1 = t[idx]; x2 = t[1+idx]; 2416 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 2417 x6 = t[5+idx]; x7 = t[6+idx]; 2418 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2419 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2420 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2421 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2422 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2423 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2424 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2425 v += 49; 2426 } 2427 idc = 7*c[i]; 2428 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 2429 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 2430 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 2431 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 2432 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 2433 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 2434 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 2435 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 2436 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 2437 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 2438 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 2439 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 2440 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 2441 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 2442 } 2443 2444 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2445 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2446 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2447 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2448 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 2449 PetscFunctionReturn(0); 2450 } 2451 2452 #undef __FUNCT__ 2453 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 2454 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 2455 { 2456 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2457 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2458 PetscErrorCode ierr; 2459 PetscInt *diag = a->diag,jdx; 2460 const MatScalar *aa=a->a,*v; 2461 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2462 const PetscScalar *b; 2463 2464 PetscFunctionBegin; 2465 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2466 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2467 /* forward solve the lower triangular */ 2468 idx = 0; 2469 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2470 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2471 x[6] = b[6+idx]; 2472 for (i=1; i<n; i++) { 2473 v = aa + 49*ai[i]; 2474 vi = aj + ai[i]; 2475 nz = diag[i] - ai[i]; 2476 idx = 7*i; 2477 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2478 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2479 s7 = b[6+idx]; 2480 while (nz--) { 2481 jdx = 7*(*vi++); 2482 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2483 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2484 x7 = x[6+jdx]; 2485 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2486 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2487 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2488 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2489 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2490 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2491 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2492 v += 49; 2493 } 2494 x[idx] = s1; 2495 x[1+idx] = s2; 2496 x[2+idx] = s3; 2497 x[3+idx] = s4; 2498 x[4+idx] = s5; 2499 x[5+idx] = s6; 2500 x[6+idx] = s7; 2501 } 2502 /* backward solve the upper triangular */ 2503 for (i=n-1; i>=0; i--){ 2504 v = aa + 49*diag[i] + 49; 2505 vi = aj + diag[i] + 1; 2506 nz = ai[i+1] - diag[i] - 1; 2507 idt = 7*i; 2508 s1 = x[idt]; s2 = x[1+idt]; 2509 s3 = x[2+idt]; s4 = x[3+idt]; 2510 s5 = x[4+idt]; s6 = x[5+idt]; 2511 s7 = x[6+idt]; 2512 while (nz--) { 2513 idx = 7*(*vi++); 2514 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2515 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2516 x7 = x[6+idx]; 2517 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2518 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2519 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2520 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2521 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2522 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2523 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2524 v += 49; 2525 } 2526 v = aa + 49*diag[i]; 2527 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 2528 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2529 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 2530 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2531 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 2532 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2533 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 2534 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2535 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 2536 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2537 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 2538 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2539 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 2540 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2541 } 2542 2543 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2544 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2545 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2546 PetscFunctionReturn(0); 2547 } 2548 2549 #undef __FUNCT__ 2550 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct" 2551 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2552 { 2553 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2554 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2555 PetscErrorCode ierr; 2556 PetscInt idx,jdx,idt; 2557 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2558 const MatScalar *aa=a->a,*v; 2559 PetscScalar *x; 2560 const PetscScalar *b; 2561 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 2562 2563 PetscFunctionBegin; 2564 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2565 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2566 /* forward solve the lower triangular */ 2567 idx = 0; 2568 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2569 x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx]; 2570 for (i=1; i<n; i++) { 2571 v = aa + bs2*ai[i]; 2572 vi = aj + ai[i]; 2573 nz = ai[i+1] - ai[i]; 2574 idx = bs*i; 2575 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2576 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 2577 for(k=0;k<nz;k++) { 2578 jdx = bs*vi[k]; 2579 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2580 x5 = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx]; 2581 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2582 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2583 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2584 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2585 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2586 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2587 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2588 v += bs2; 2589 } 2590 2591 x[idx] = s1; 2592 x[1+idx] = s2; 2593 x[2+idx] = s3; 2594 x[3+idx] = s4; 2595 x[4+idx] = s5; 2596 x[5+idx] = s6; 2597 x[6+idx] = s7; 2598 } 2599 2600 /* backward solve the upper triangular */ 2601 for (i=n-1; i>=0; i--){ 2602 v = aa + bs2*(adiag[i+1]+1); 2603 vi = aj + adiag[i+1]+1; 2604 nz = adiag[i] - adiag[i+1]-1; 2605 idt = bs*i; 2606 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2607 s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt]; 2608 for(k=0;k<nz;k++) { 2609 idx = bs*vi[k]; 2610 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2611 x5 = x[4+idx];x6 = x[5+idx];x7 = x[6+idx]; 2612 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 2613 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 2614 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 2615 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 2616 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 2617 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 2618 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 2619 v += bs2; 2620 } 2621 /* x = inv_diagonal*x */ 2622 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 + v[28]*s5 + v[35]*s6 + v[42]*s7; 2623 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 + v[29]*s5 + v[36]*s6 + v[43]*s7; 2624 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 + v[30]*s5 + v[37]*s6 + v[44]*s7; 2625 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 + v[31]*s5 + v[38]*s6 + v[45]*s7; 2626 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 + v[32]*s5 + v[39]*s6 + v[46]*s7; 2627 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 + v[33]*s5 + v[40]*s6 + v[47]*s7; 2628 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 + v[34]*s5 + v[41]*s6 + v[48]*s7; 2629 } 2630 2631 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2632 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2633 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2634 PetscFunctionReturn(0); 2635 } 2636 2637 #undef __FUNCT__ 2638 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 2639 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 2640 { 2641 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2642 IS iscol=a->col,isrow=a->row; 2643 PetscErrorCode ierr; 2644 const PetscInt *r,*c,*rout,*cout; 2645 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2646 const MatScalar *aa=a->a,*v; 2647 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2648 const PetscScalar *b; 2649 PetscFunctionBegin; 2650 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2651 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2652 t = a->solve_work; 2653 2654 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2655 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2656 2657 /* forward solve the lower triangular */ 2658 idx = 6*(*r++); 2659 t[0] = b[idx]; t[1] = b[1+idx]; 2660 t[2] = b[2+idx]; t[3] = b[3+idx]; 2661 t[4] = b[4+idx]; t[5] = b[5+idx]; 2662 for (i=1; i<n; i++) { 2663 v = aa + 36*ai[i]; 2664 vi = aj + ai[i]; 2665 nz = diag[i] - ai[i]; 2666 idx = 6*(*r++); 2667 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2668 s5 = b[4+idx]; s6 = b[5+idx]; 2669 while (nz--) { 2670 idx = 6*(*vi++); 2671 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2672 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2673 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2674 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2675 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2676 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2677 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2678 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2679 v += 36; 2680 } 2681 idx = 6*i; 2682 t[idx] = s1;t[1+idx] = s2; 2683 t[2+idx] = s3;t[3+idx] = s4; 2684 t[4+idx] = s5;t[5+idx] = s6; 2685 } 2686 /* backward solve the upper triangular */ 2687 for (i=n-1; i>=0; i--){ 2688 v = aa + 36*diag[i] + 36; 2689 vi = aj + diag[i] + 1; 2690 nz = ai[i+1] - diag[i] - 1; 2691 idt = 6*i; 2692 s1 = t[idt]; s2 = t[1+idt]; 2693 s3 = t[2+idt];s4 = t[3+idt]; 2694 s5 = t[4+idt];s6 = t[5+idt]; 2695 while (nz--) { 2696 idx = 6*(*vi++); 2697 x1 = t[idx]; x2 = t[1+idx]; 2698 x3 = t[2+idx]; x4 = t[3+idx]; 2699 x5 = t[4+idx]; x6 = t[5+idx]; 2700 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2701 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2702 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2703 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2704 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2705 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2706 v += 36; 2707 } 2708 idc = 6*(*c--); 2709 v = aa + 36*diag[i]; 2710 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2711 v[18]*s4+v[24]*s5+v[30]*s6; 2712 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2713 v[19]*s4+v[25]*s5+v[31]*s6; 2714 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2715 v[20]*s4+v[26]*s5+v[32]*s6; 2716 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2717 v[21]*s4+v[27]*s5+v[33]*s6; 2718 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2719 v[22]*s4+v[28]*s5+v[34]*s6; 2720 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2721 v[23]*s4+v[29]*s5+v[35]*s6; 2722 } 2723 2724 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2725 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2726 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2727 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2728 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2729 PetscFunctionReturn(0); 2730 } 2731 2732 #undef __FUNCT__ 2733 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct" 2734 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx) 2735 { 2736 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2737 IS iscol=a->col,isrow=a->row; 2738 PetscErrorCode ierr; 2739 const PetscInt *r,*c,*rout,*cout; 2740 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 2741 const MatScalar *aa=a->a,*v; 2742 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 2743 const PetscScalar *b; 2744 PetscFunctionBegin; 2745 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2746 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2747 t = a->solve_work; 2748 2749 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2750 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 2751 2752 /* forward solve the lower triangular */ 2753 idx = 6*r[0]; 2754 t[0] = b[idx]; t[1] = b[1+idx]; 2755 t[2] = b[2+idx]; t[3] = b[3+idx]; 2756 t[4] = b[4+idx]; t[5] = b[5+idx]; 2757 for (i=1; i<n; i++) { 2758 v = aa + 36*ai[i]; 2759 vi = aj + ai[i]; 2760 nz = ai[i+1] - ai[i]; 2761 idx = 6*r[i]; 2762 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2763 s5 = b[4+idx]; s6 = b[5+idx]; 2764 for(m=0;m<nz;m++){ 2765 idx = 6*vi[m]; 2766 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2767 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 2768 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2769 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2770 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2771 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2772 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2773 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2774 v += 36; 2775 } 2776 idx = 6*i; 2777 t[idx] = s1;t[1+idx] = s2; 2778 t[2+idx] = s3;t[3+idx] = s4; 2779 t[4+idx] = s5;t[5+idx] = s6; 2780 } 2781 /* backward solve the upper triangular */ 2782 for (i=n-1; i>=0; i--){ 2783 v = aa + 36*(adiag[i+1]+1); 2784 vi = aj + adiag[i+1]+1; 2785 nz = adiag[i] - adiag[i+1] - 1; 2786 idt = 6*i; 2787 s1 = t[idt]; s2 = t[1+idt]; 2788 s3 = t[2+idt];s4 = t[3+idt]; 2789 s5 = t[4+idt];s6 = t[5+idt]; 2790 for(m=0;m<nz;m++){ 2791 idx = 6*vi[m]; 2792 x1 = t[idx]; x2 = t[1+idx]; 2793 x3 = t[2+idx]; x4 = t[3+idx]; 2794 x5 = t[4+idx]; x6 = t[5+idx]; 2795 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2796 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2797 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2798 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2799 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2800 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2801 v += 36; 2802 } 2803 idc = 6*c[i]; 2804 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 2805 v[18]*s4+v[24]*s5+v[30]*s6; 2806 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 2807 v[19]*s4+v[25]*s5+v[31]*s6; 2808 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 2809 v[20]*s4+v[26]*s5+v[32]*s6; 2810 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 2811 v[21]*s4+v[27]*s5+v[33]*s6; 2812 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 2813 v[22]*s4+v[28]*s5+v[34]*s6; 2814 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 2815 v[23]*s4+v[29]*s5+v[35]*s6; 2816 } 2817 2818 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2819 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2820 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2821 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2822 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2823 PetscFunctionReturn(0); 2824 } 2825 2826 #undef __FUNCT__ 2827 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 2828 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 2829 { 2830 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2831 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 2832 PetscErrorCode ierr; 2833 PetscInt *diag = a->diag,jdx; 2834 const MatScalar *aa=a->a,*v; 2835 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2836 const PetscScalar *b; 2837 2838 PetscFunctionBegin; 2839 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2840 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2841 /* forward solve the lower triangular */ 2842 idx = 0; 2843 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 2844 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 2845 for (i=1; i<n; i++) { 2846 v = aa + 36*ai[i]; 2847 vi = aj + ai[i]; 2848 nz = diag[i] - ai[i]; 2849 idx = 6*i; 2850 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2851 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 2852 while (nz--) { 2853 jdx = 6*(*vi++); 2854 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 2855 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 2856 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2857 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2858 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2859 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2860 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2861 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2862 v += 36; 2863 } 2864 x[idx] = s1; 2865 x[1+idx] = s2; 2866 x[2+idx] = s3; 2867 x[3+idx] = s4; 2868 x[4+idx] = s5; 2869 x[5+idx] = s6; 2870 } 2871 /* backward solve the upper triangular */ 2872 for (i=n-1; i>=0; i--){ 2873 v = aa + 36*diag[i] + 36; 2874 vi = aj + diag[i] + 1; 2875 nz = ai[i+1] - diag[i] - 1; 2876 idt = 6*i; 2877 s1 = x[idt]; s2 = x[1+idt]; 2878 s3 = x[2+idt]; s4 = x[3+idt]; 2879 s5 = x[4+idt]; s6 = x[5+idt]; 2880 while (nz--) { 2881 idx = 6*(*vi++); 2882 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 2883 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 2884 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2885 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 2886 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2887 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2888 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2889 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2890 v += 36; 2891 } 2892 v = aa + 36*diag[i]; 2893 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2894 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2895 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2896 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2897 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2898 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2899 } 2900 2901 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2902 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2903 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 2904 PetscFunctionReturn(0); 2905 } 2906 2907 #undef __FUNCT__ 2908 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct" 2909 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 2910 { 2911 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2912 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 2913 PetscErrorCode ierr; 2914 PetscInt idx,jdx,idt; 2915 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 2916 const MatScalar *aa=a->a,*v; 2917 PetscScalar *x; 2918 const PetscScalar *b; 2919 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 2920 2921 PetscFunctionBegin; 2922 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2923 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2924 /* forward solve the lower triangular */ 2925 idx = 0; 2926 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 2927 x[4] = b[4+idx];x[5] = b[5+idx]; 2928 for (i=1; i<n; i++) { 2929 v = aa + bs2*ai[i]; 2930 vi = aj + ai[i]; 2931 nz = ai[i+1] - ai[i]; 2932 idx = bs*i; 2933 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2934 s5 = b[4+idx];s6 = b[5+idx]; 2935 for(k=0;k<nz;k++){ 2936 jdx = bs*vi[k]; 2937 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 2938 x5 = x[4+jdx]; x6 = x[5+jdx]; 2939 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2940 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2941 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2942 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2943 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2944 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2945 v += bs2; 2946 } 2947 2948 x[idx] = s1; 2949 x[1+idx] = s2; 2950 x[2+idx] = s3; 2951 x[3+idx] = s4; 2952 x[4+idx] = s5; 2953 x[5+idx] = s6; 2954 } 2955 2956 /* backward solve the upper triangular */ 2957 for (i=n-1; i>=0; i--){ 2958 v = aa + bs2*(adiag[i+1]+1); 2959 vi = aj + adiag[i+1]+1; 2960 nz = adiag[i] - adiag[i+1]-1; 2961 idt = bs*i; 2962 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 2963 s5 = x[4+idt];s6 = x[5+idt]; 2964 for(k=0;k<nz;k++){ 2965 idx = bs*vi[k]; 2966 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 2967 x5 = x[4+idx];x6 = x[5+idx]; 2968 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 2969 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;; 2970 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 2971 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 2972 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 2973 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 2974 v += bs2; 2975 } 2976 /* x = inv_diagonal*x */ 2977 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 2978 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 2979 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 2980 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 2981 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 2982 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 2983 } 2984 2985 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2986 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2987 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 2988 PetscFunctionReturn(0); 2989 } 2990 2991 #undef __FUNCT__ 2992 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 2993 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 2994 { 2995 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2996 IS iscol=a->col,isrow=a->row; 2997 PetscErrorCode ierr; 2998 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 2999 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3000 const MatScalar *aa=a->a,*v; 3001 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3002 const PetscScalar *b; 3003 3004 PetscFunctionBegin; 3005 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3006 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3007 t = a->solve_work; 3008 3009 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3010 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3011 3012 /* forward solve the lower triangular */ 3013 idx = 5*(*r++); 3014 t[0] = b[idx]; t[1] = b[1+idx]; 3015 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3016 for (i=1; i<n; i++) { 3017 v = aa + 25*ai[i]; 3018 vi = aj + ai[i]; 3019 nz = diag[i] - ai[i]; 3020 idx = 5*(*r++); 3021 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3022 s5 = b[4+idx]; 3023 while (nz--) { 3024 idx = 5*(*vi++); 3025 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3026 x4 = t[3+idx];x5 = t[4+idx]; 3027 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3028 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3029 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3030 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3031 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3032 v += 25; 3033 } 3034 idx = 5*i; 3035 t[idx] = s1;t[1+idx] = s2; 3036 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3037 } 3038 /* backward solve the upper triangular */ 3039 for (i=n-1; i>=0; i--){ 3040 v = aa + 25*diag[i] + 25; 3041 vi = aj + diag[i] + 1; 3042 nz = ai[i+1] - diag[i] - 1; 3043 idt = 5*i; 3044 s1 = t[idt]; s2 = t[1+idt]; 3045 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3046 while (nz--) { 3047 idx = 5*(*vi++); 3048 x1 = t[idx]; x2 = t[1+idx]; 3049 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3050 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3051 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3052 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3053 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3054 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3055 v += 25; 3056 } 3057 idc = 5*(*c--); 3058 v = aa + 25*diag[i]; 3059 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3060 v[15]*s4+v[20]*s5; 3061 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3062 v[16]*s4+v[21]*s5; 3063 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3064 v[17]*s4+v[22]*s5; 3065 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3066 v[18]*s4+v[23]*s5; 3067 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3068 v[19]*s4+v[24]*s5; 3069 } 3070 3071 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3072 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3073 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3074 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3075 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3076 PetscFunctionReturn(0); 3077 } 3078 3079 #undef __FUNCT__ 3080 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct" 3081 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx) 3082 { 3083 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3084 IS iscol=a->col,isrow=a->row; 3085 PetscErrorCode ierr; 3086 const PetscInt *r,*c,*rout,*cout; 3087 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3088 const MatScalar *aa=a->a,*v; 3089 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 3090 const PetscScalar *b; 3091 3092 PetscFunctionBegin; 3093 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3094 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3095 t = a->solve_work; 3096 3097 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3098 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3099 3100 /* forward solve the lower triangular */ 3101 idx = 5*r[0]; 3102 t[0] = b[idx]; t[1] = b[1+idx]; 3103 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 3104 for (i=1; i<n; i++) { 3105 v = aa + 25*ai[i]; 3106 vi = aj + ai[i]; 3107 nz = ai[i+1] - ai[i]; 3108 idx = 5*r[i]; 3109 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3110 s5 = b[4+idx]; 3111 for(m=0;m<nz;m++){ 3112 idx = 5*vi[m]; 3113 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 3114 x4 = t[3+idx];x5 = t[4+idx]; 3115 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3116 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3117 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3118 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3119 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3120 v += 25; 3121 } 3122 idx = 5*i; 3123 t[idx] = s1;t[1+idx] = s2; 3124 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 3125 } 3126 /* backward solve the upper triangular */ 3127 for (i=n-1; i>=0; i--){ 3128 v = aa + 25*(adiag[i+1]+1); 3129 vi = aj + adiag[i+1]+1; 3130 nz = adiag[i] - adiag[i+1] - 1; 3131 idt = 5*i; 3132 s1 = t[idt]; s2 = t[1+idt]; 3133 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 3134 for(m=0;m<nz;m++){ 3135 idx = 5*vi[m]; 3136 x1 = t[idx]; x2 = t[1+idx]; 3137 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 3138 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3139 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3140 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3141 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3142 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3143 v += 25; 3144 } 3145 idc = 5*c[i]; 3146 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 3147 v[15]*s4+v[20]*s5; 3148 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 3149 v[16]*s4+v[21]*s5; 3150 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 3151 v[17]*s4+v[22]*s5; 3152 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 3153 v[18]*s4+v[23]*s5; 3154 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 3155 v[19]*s4+v[24]*s5; 3156 } 3157 3158 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3159 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3160 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3161 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3162 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3163 PetscFunctionReturn(0); 3164 } 3165 3166 #undef __FUNCT__ 3167 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 3168 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 3169 { 3170 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3171 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 3172 PetscErrorCode ierr; 3173 PetscInt *diag = a->diag,jdx; 3174 const MatScalar *aa=a->a,*v; 3175 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3176 const PetscScalar *b; 3177 3178 PetscFunctionBegin; 3179 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3180 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3181 /* forward solve the lower triangular */ 3182 idx = 0; 3183 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3184 for (i=1; i<n; i++) { 3185 v = aa + 25*ai[i]; 3186 vi = aj + ai[i]; 3187 nz = diag[i] - ai[i]; 3188 idx = 5*i; 3189 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3190 while (nz--) { 3191 jdx = 5*(*vi++); 3192 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3193 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3194 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3195 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3196 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3197 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3198 v += 25; 3199 } 3200 x[idx] = s1; 3201 x[1+idx] = s2; 3202 x[2+idx] = s3; 3203 x[3+idx] = s4; 3204 x[4+idx] = s5; 3205 } 3206 /* backward solve the upper triangular */ 3207 for (i=n-1; i>=0; i--){ 3208 v = aa + 25*diag[i] + 25; 3209 vi = aj + diag[i] + 1; 3210 nz = ai[i+1] - diag[i] - 1; 3211 idt = 5*i; 3212 s1 = x[idt]; s2 = x[1+idt]; 3213 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3214 while (nz--) { 3215 idx = 5*(*vi++); 3216 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3217 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3218 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3219 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3220 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3221 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3222 v += 25; 3223 } 3224 v = aa + 25*diag[i]; 3225 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3226 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3227 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3228 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3229 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3230 } 3231 3232 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3234 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3235 PetscFunctionReturn(0); 3236 } 3237 3238 #undef __FUNCT__ 3239 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct" 3240 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3241 { 3242 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3243 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 3244 PetscErrorCode ierr; 3245 PetscInt jdx; 3246 const MatScalar *aa=a->a,*v; 3247 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 3248 const PetscScalar *b; 3249 3250 PetscFunctionBegin; 3251 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3252 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3253 /* forward solve the lower triangular */ 3254 idx = 0; 3255 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 3256 for (i=1; i<n; i++) { 3257 v = aa + 25*ai[i]; 3258 vi = aj + ai[i]; 3259 nz = ai[i+1] - ai[i]; 3260 idx = 5*i; 3261 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 3262 for(k=0;k<nz;k++) { 3263 jdx = 5*vi[k]; 3264 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 3265 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3266 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3267 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3268 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3269 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3270 v += 25; 3271 } 3272 x[idx] = s1; 3273 x[1+idx] = s2; 3274 x[2+idx] = s3; 3275 x[3+idx] = s4; 3276 x[4+idx] = s5; 3277 } 3278 3279 /* backward solve the upper triangular */ 3280 for (i=n-1; i>=0; i--){ 3281 v = aa + 25*(adiag[i+1]+1); 3282 vi = aj + adiag[i+1]+1; 3283 nz = adiag[i] - adiag[i+1]-1; 3284 idt = 5*i; 3285 s1 = x[idt]; s2 = x[1+idt]; 3286 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 3287 for(k=0;k<nz;k++){ 3288 idx = 5*vi[k]; 3289 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 3290 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 3291 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 3292 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 3293 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 3294 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 3295 v += 25; 3296 } 3297 /* x = inv_diagonal*x */ 3298 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 3299 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 3300 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 3301 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 3302 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 3303 } 3304 3305 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3306 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3307 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 3308 PetscFunctionReturn(0); 3309 } 3310 3311 #undef __FUNCT__ 3312 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 3313 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 3314 { 3315 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3316 IS iscol=a->col,isrow=a->row; 3317 PetscErrorCode ierr; 3318 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3319 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3320 const MatScalar *aa=a->a,*v; 3321 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3322 const PetscScalar *b; 3323 3324 PetscFunctionBegin; 3325 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3326 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3327 t = a->solve_work; 3328 3329 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3330 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3331 3332 /* forward solve the lower triangular */ 3333 idx = 4*(*r++); 3334 t[0] = b[idx]; t[1] = b[1+idx]; 3335 t[2] = b[2+idx]; t[3] = b[3+idx]; 3336 for (i=1; i<n; i++) { 3337 v = aa + 16*ai[i]; 3338 vi = aj + ai[i]; 3339 nz = diag[i] - ai[i]; 3340 idx = 4*(*r++); 3341 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3342 while (nz--) { 3343 idx = 4*(*vi++); 3344 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3345 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3346 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3347 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3348 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3349 v += 16; 3350 } 3351 idx = 4*i; 3352 t[idx] = s1;t[1+idx] = s2; 3353 t[2+idx] = s3;t[3+idx] = s4; 3354 } 3355 /* backward solve the upper triangular */ 3356 for (i=n-1; i>=0; i--){ 3357 v = aa + 16*diag[i] + 16; 3358 vi = aj + diag[i] + 1; 3359 nz = ai[i+1] - diag[i] - 1; 3360 idt = 4*i; 3361 s1 = t[idt]; s2 = t[1+idt]; 3362 s3 = t[2+idt];s4 = t[3+idt]; 3363 while (nz--) { 3364 idx = 4*(*vi++); 3365 x1 = t[idx]; x2 = t[1+idx]; 3366 x3 = t[2+idx]; x4 = t[3+idx]; 3367 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3368 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3369 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3370 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3371 v += 16; 3372 } 3373 idc = 4*(*c--); 3374 v = aa + 16*diag[i]; 3375 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3376 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3377 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3378 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3379 } 3380 3381 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3382 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3383 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3384 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3385 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3386 PetscFunctionReturn(0); 3387 } 3388 3389 #undef __FUNCT__ 3390 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct" 3391 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx) 3392 { 3393 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3394 IS iscol=a->col,isrow=a->row; 3395 PetscErrorCode ierr; 3396 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 3397 const PetscInt *r,*c,*rout,*cout; 3398 const MatScalar *aa=a->a,*v; 3399 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 3400 const PetscScalar *b; 3401 3402 PetscFunctionBegin; 3403 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3404 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3405 t = a->solve_work; 3406 3407 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3408 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 3409 3410 /* forward solve the lower triangular */ 3411 idx = 4*r[0]; 3412 t[0] = b[idx]; t[1] = b[1+idx]; 3413 t[2] = b[2+idx]; t[3] = b[3+idx]; 3414 for (i=1; i<n; i++) { 3415 v = aa + 16*ai[i]; 3416 vi = aj + ai[i]; 3417 nz = ai[i+1] - ai[i]; 3418 idx = 4*r[i]; 3419 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3420 for(m=0;m<nz;m++){ 3421 idx = 4*vi[m]; 3422 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 3423 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3424 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3425 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3426 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3427 v += 16; 3428 } 3429 idx = 4*i; 3430 t[idx] = s1;t[1+idx] = s2; 3431 t[2+idx] = s3;t[3+idx] = s4; 3432 } 3433 /* backward solve the upper triangular */ 3434 for (i=n-1; i>=0; i--){ 3435 v = aa + 16*(adiag[i+1]+1); 3436 vi = aj + adiag[i+1]+1; 3437 nz = adiag[i] - adiag[i+1] - 1; 3438 idt = 4*i; 3439 s1 = t[idt]; s2 = t[1+idt]; 3440 s3 = t[2+idt];s4 = t[3+idt]; 3441 for(m=0;m<nz;m++){ 3442 idx = 4*vi[m]; 3443 x1 = t[idx]; x2 = t[1+idx]; 3444 x3 = t[2+idx]; x4 = t[3+idx]; 3445 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3446 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3447 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3448 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3449 v += 16; 3450 } 3451 idc = 4*c[i]; 3452 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3453 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3454 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3455 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3456 } 3457 3458 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3459 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3460 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3461 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3462 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3463 PetscFunctionReturn(0); 3464 } 3465 3466 #undef __FUNCT__ 3467 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 3468 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 3469 { 3470 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3471 IS iscol=a->col,isrow=a->row; 3472 PetscErrorCode ierr; 3473 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 3474 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3475 const MatScalar *aa=a->a,*v; 3476 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 3477 PetscScalar *x; 3478 const PetscScalar *b; 3479 3480 PetscFunctionBegin; 3481 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3482 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3483 t = (MatScalar *)a->solve_work; 3484 3485 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3486 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3487 3488 /* forward solve the lower triangular */ 3489 idx = 4*(*r++); 3490 t[0] = (MatScalar)b[idx]; 3491 t[1] = (MatScalar)b[1+idx]; 3492 t[2] = (MatScalar)b[2+idx]; 3493 t[3] = (MatScalar)b[3+idx]; 3494 for (i=1; i<n; i++) { 3495 v = aa + 16*ai[i]; 3496 vi = aj + ai[i]; 3497 nz = diag[i] - ai[i]; 3498 idx = 4*(*r++); 3499 s1 = (MatScalar)b[idx]; 3500 s2 = (MatScalar)b[1+idx]; 3501 s3 = (MatScalar)b[2+idx]; 3502 s4 = (MatScalar)b[3+idx]; 3503 while (nz--) { 3504 idx = 4*(*vi++); 3505 x1 = t[idx]; 3506 x2 = t[1+idx]; 3507 x3 = t[2+idx]; 3508 x4 = t[3+idx]; 3509 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3510 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3511 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3512 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3513 v += 16; 3514 } 3515 idx = 4*i; 3516 t[idx] = s1; 3517 t[1+idx] = s2; 3518 t[2+idx] = s3; 3519 t[3+idx] = s4; 3520 } 3521 /* backward solve the upper triangular */ 3522 for (i=n-1; i>=0; i--){ 3523 v = aa + 16*diag[i] + 16; 3524 vi = aj + diag[i] + 1; 3525 nz = ai[i+1] - diag[i] - 1; 3526 idt = 4*i; 3527 s1 = t[idt]; 3528 s2 = t[1+idt]; 3529 s3 = t[2+idt]; 3530 s4 = t[3+idt]; 3531 while (nz--) { 3532 idx = 4*(*vi++); 3533 x1 = t[idx]; 3534 x2 = t[1+idx]; 3535 x3 = t[2+idx]; 3536 x4 = t[3+idx]; 3537 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3538 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3539 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3540 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3541 v += 16; 3542 } 3543 idc = 4*(*c--); 3544 v = aa + 16*diag[i]; 3545 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 3546 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 3547 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 3548 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 3549 x[idc] = (PetscScalar)t[idt]; 3550 x[1+idc] = (PetscScalar)t[1+idt]; 3551 x[2+idc] = (PetscScalar)t[2+idt]; 3552 x[3+idc] = (PetscScalar)t[3+idt]; 3553 } 3554 3555 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3556 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3557 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3558 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3559 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3560 PetscFunctionReturn(0); 3561 } 3562 3563 #if defined (PETSC_HAVE_SSE) 3564 3565 #include PETSC_HAVE_SSE 3566 3567 #undef __FUNCT__ 3568 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 3569 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 3570 { 3571 /* 3572 Note: This code uses demotion of double 3573 to float when performing the mixed-mode computation. 3574 This may not be numerically reasonable for all applications. 3575 */ 3576 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3577 IS iscol=a->col,isrow=a->row; 3578 PetscErrorCode ierr; 3579 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 3580 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3581 MatScalar *aa=a->a,*v; 3582 PetscScalar *x,*b,*t; 3583 3584 /* Make space in temp stack for 16 Byte Aligned arrays */ 3585 float ssealignedspace[11],*tmps,*tmpx; 3586 unsigned long offset; 3587 3588 PetscFunctionBegin; 3589 SSE_SCOPE_BEGIN; 3590 3591 offset = (unsigned long)ssealignedspace % 16; 3592 if (offset) offset = (16 - offset)/4; 3593 tmps = &ssealignedspace[offset]; 3594 tmpx = &ssealignedspace[offset+4]; 3595 PREFETCH_NTA(aa+16*ai[1]); 3596 3597 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3598 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3599 t = a->solve_work; 3600 3601 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3602 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3603 3604 /* forward solve the lower triangular */ 3605 idx = 4*(*r++); 3606 t[0] = b[idx]; t[1] = b[1+idx]; 3607 t[2] = b[2+idx]; t[3] = b[3+idx]; 3608 v = aa + 16*ai[1]; 3609 3610 for (i=1; i<n;) { 3611 PREFETCH_NTA(&v[8]); 3612 vi = aj + ai[i]; 3613 nz = diag[i] - ai[i]; 3614 idx = 4*(*r++); 3615 3616 /* Demote sum from double to float */ 3617 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 3618 LOAD_PS(tmps,XMM7); 3619 3620 while (nz--) { 3621 PREFETCH_NTA(&v[16]); 3622 idx = 4*(*vi++); 3623 3624 /* Demote solution (so far) from double to float */ 3625 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 3626 3627 /* 4x4 Matrix-Vector product with negative accumulation: */ 3628 SSE_INLINE_BEGIN_2(tmpx,v) 3629 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3630 3631 /* First Column */ 3632 SSE_COPY_PS(XMM0,XMM6) 3633 SSE_SHUFFLE(XMM0,XMM0,0x00) 3634 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3635 SSE_SUB_PS(XMM7,XMM0) 3636 3637 /* Second Column */ 3638 SSE_COPY_PS(XMM1,XMM6) 3639 SSE_SHUFFLE(XMM1,XMM1,0x55) 3640 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3641 SSE_SUB_PS(XMM7,XMM1) 3642 3643 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3644 3645 /* Third Column */ 3646 SSE_COPY_PS(XMM2,XMM6) 3647 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3648 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3649 SSE_SUB_PS(XMM7,XMM2) 3650 3651 /* Fourth Column */ 3652 SSE_COPY_PS(XMM3,XMM6) 3653 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3654 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3655 SSE_SUB_PS(XMM7,XMM3) 3656 SSE_INLINE_END_2 3657 3658 v += 16; 3659 } 3660 idx = 4*i; 3661 v = aa + 16*ai[++i]; 3662 PREFETCH_NTA(v); 3663 STORE_PS(tmps,XMM7); 3664 3665 /* Promote result from float to double */ 3666 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 3667 } 3668 /* backward solve the upper triangular */ 3669 idt = 4*(n-1); 3670 ai16 = 16*diag[n-1]; 3671 v = aa + ai16 + 16; 3672 for (i=n-1; i>=0;){ 3673 PREFETCH_NTA(&v[8]); 3674 vi = aj + diag[i] + 1; 3675 nz = ai[i+1] - diag[i] - 1; 3676 3677 /* Demote accumulator from double to float */ 3678 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 3679 LOAD_PS(tmps,XMM7); 3680 3681 while (nz--) { 3682 PREFETCH_NTA(&v[16]); 3683 idx = 4*(*vi++); 3684 3685 /* Demote solution (so far) from double to float */ 3686 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 3687 3688 /* 4x4 Matrix-Vector Product with negative accumulation: */ 3689 SSE_INLINE_BEGIN_2(tmpx,v) 3690 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 3691 3692 /* First Column */ 3693 SSE_COPY_PS(XMM0,XMM6) 3694 SSE_SHUFFLE(XMM0,XMM0,0x00) 3695 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 3696 SSE_SUB_PS(XMM7,XMM0) 3697 3698 /* Second Column */ 3699 SSE_COPY_PS(XMM1,XMM6) 3700 SSE_SHUFFLE(XMM1,XMM1,0x55) 3701 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 3702 SSE_SUB_PS(XMM7,XMM1) 3703 3704 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 3705 3706 /* Third Column */ 3707 SSE_COPY_PS(XMM2,XMM6) 3708 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3709 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 3710 SSE_SUB_PS(XMM7,XMM2) 3711 3712 /* Fourth Column */ 3713 SSE_COPY_PS(XMM3,XMM6) 3714 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3715 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 3716 SSE_SUB_PS(XMM7,XMM3) 3717 SSE_INLINE_END_2 3718 v += 16; 3719 } 3720 v = aa + ai16; 3721 ai16 = 16*diag[--i]; 3722 PREFETCH_NTA(aa+ai16+16); 3723 /* 3724 Scale the result by the diagonal 4x4 block, 3725 which was inverted as part of the factorization 3726 */ 3727 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 3728 /* First Column */ 3729 SSE_COPY_PS(XMM0,XMM7) 3730 SSE_SHUFFLE(XMM0,XMM0,0x00) 3731 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 3732 3733 /* Second Column */ 3734 SSE_COPY_PS(XMM1,XMM7) 3735 SSE_SHUFFLE(XMM1,XMM1,0x55) 3736 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 3737 SSE_ADD_PS(XMM0,XMM1) 3738 3739 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 3740 3741 /* Third Column */ 3742 SSE_COPY_PS(XMM2,XMM7) 3743 SSE_SHUFFLE(XMM2,XMM2,0xAA) 3744 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 3745 SSE_ADD_PS(XMM0,XMM2) 3746 3747 /* Fourth Column */ 3748 SSE_COPY_PS(XMM3,XMM7) 3749 SSE_SHUFFLE(XMM3,XMM3,0xFF) 3750 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 3751 SSE_ADD_PS(XMM0,XMM3) 3752 3753 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 3754 SSE_INLINE_END_3 3755 3756 /* Promote solution from float to double */ 3757 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 3758 3759 /* Apply reordering to t and stream into x. */ 3760 /* This way, x doesn't pollute the cache. */ 3761 /* Be careful with size: 2 doubles = 4 floats! */ 3762 idc = 4*(*c--); 3763 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 3764 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 3765 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 3766 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 3767 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 3768 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 3769 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 3770 SSE_INLINE_END_2 3771 v = aa + ai16 + 16; 3772 idt -= 4; 3773 } 3774 3775 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3776 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3777 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3778 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3779 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3780 SSE_SCOPE_END; 3781 PetscFunctionReturn(0); 3782 } 3783 3784 #endif 3785 3786 3787 /* 3788 Special case where the matrix was ILU(0) factored in the natural 3789 ordering. This eliminates the need for the column and row permutation. 3790 */ 3791 #undef __FUNCT__ 3792 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 3793 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 3794 { 3795 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3796 PetscInt n=a->mbs; 3797 const PetscInt *ai=a->i,*aj=a->j; 3798 PetscErrorCode ierr; 3799 const PetscInt *diag = a->diag; 3800 const MatScalar *aa=a->a; 3801 PetscScalar *x; 3802 const PetscScalar *b; 3803 3804 PetscFunctionBegin; 3805 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3806 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3807 3808 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 3809 { 3810 static PetscScalar w[2000]; /* very BAD need to fix */ 3811 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 3812 } 3813 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 3814 { 3815 static PetscScalar w[2000]; /* very BAD need to fix */ 3816 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 3817 } 3818 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 3819 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 3820 #else 3821 { 3822 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3823 const MatScalar *v; 3824 PetscInt jdx,idt,idx,nz,i,ai16; 3825 const PetscInt *vi; 3826 3827 /* forward solve the lower triangular */ 3828 idx = 0; 3829 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 3830 for (i=1; i<n; i++) { 3831 v = aa + 16*ai[i]; 3832 vi = aj + ai[i]; 3833 nz = diag[i] - ai[i]; 3834 idx += 4; 3835 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3836 while (nz--) { 3837 jdx = 4*(*vi++); 3838 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 3839 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3840 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3841 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3842 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3843 v += 16; 3844 } 3845 x[idx] = s1; 3846 x[1+idx] = s2; 3847 x[2+idx] = s3; 3848 x[3+idx] = s4; 3849 } 3850 /* backward solve the upper triangular */ 3851 idt = 4*(n-1); 3852 for (i=n-1; i>=0; i--){ 3853 ai16 = 16*diag[i]; 3854 v = aa + ai16 + 16; 3855 vi = aj + diag[i] + 1; 3856 nz = ai[i+1] - diag[i] - 1; 3857 s1 = x[idt]; s2 = x[1+idt]; 3858 s3 = x[2+idt];s4 = x[3+idt]; 3859 while (nz--) { 3860 idx = 4*(*vi++); 3861 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 3862 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3863 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3864 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3865 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3866 v += 16; 3867 } 3868 v = aa + ai16; 3869 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3870 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 3871 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3872 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3873 idt -= 4; 3874 } 3875 } 3876 #endif 3877 3878 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3879 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3880 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 3881 PetscFunctionReturn(0); 3882 } 3883 3884 #undef __FUNCT__ 3885 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct" 3886 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 3887 { 3888 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3889 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 3890 PetscErrorCode ierr; 3891 PetscInt idx,jdx,idt; 3892 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 3893 const MatScalar *aa=a->a,*v; 3894 PetscScalar *x; 3895 const PetscScalar *b; 3896 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 3897 3898 PetscFunctionBegin; 3899 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3900 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3901 /* forward solve the lower triangular */ 3902 idx = 0; 3903 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx]; 3904 for (i=1; i<n; i++) { 3905 v = aa + bs2*ai[i]; 3906 vi = aj + ai[i]; 3907 nz = ai[i+1] - ai[i]; 3908 idx = bs*i; 3909 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 3910 for(k=0;k<nz;k++) { 3911 jdx = bs*vi[k]; 3912 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx]; 3913 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3914 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3915 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3916 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3917 3918 v += bs2; 3919 } 3920 3921 x[idx] = s1; 3922 x[1+idx] = s2; 3923 x[2+idx] = s3; 3924 x[3+idx] = s4; 3925 } 3926 3927 /* backward solve the upper triangular */ 3928 for (i=n-1; i>=0; i--){ 3929 v = aa + bs2*(adiag[i+1]+1); 3930 vi = aj + adiag[i+1]+1; 3931 nz = adiag[i] - adiag[i+1]-1; 3932 idt = bs*i; 3933 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt]; 3934 3935 for(k=0;k<nz;k++){ 3936 idx = bs*vi[k]; 3937 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx]; 3938 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 3939 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 3940 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 3941 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 3942 3943 v += bs2; 3944 } 3945 /* x = inv_diagonal*x */ 3946 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 3947 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;; 3948 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 3949 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 3950 3951 } 3952 3953 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3954 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3955 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 3956 PetscFunctionReturn(0); 3957 } 3958 3959 #undef __FUNCT__ 3960 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 3961 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 3962 { 3963 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3964 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3965 PetscErrorCode ierr; 3966 PetscInt *diag = a->diag; 3967 MatScalar *aa=a->a; 3968 PetscScalar *x,*b; 3969 3970 PetscFunctionBegin; 3971 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3972 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3973 3974 { 3975 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 3976 MatScalar *v,*t=(MatScalar *)x; 3977 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 3978 3979 /* forward solve the lower triangular */ 3980 idx = 0; 3981 t[0] = (MatScalar)b[0]; 3982 t[1] = (MatScalar)b[1]; 3983 t[2] = (MatScalar)b[2]; 3984 t[3] = (MatScalar)b[3]; 3985 for (i=1; i<n; i++) { 3986 v = aa + 16*ai[i]; 3987 vi = aj + ai[i]; 3988 nz = diag[i] - ai[i]; 3989 idx += 4; 3990 s1 = (MatScalar)b[idx]; 3991 s2 = (MatScalar)b[1+idx]; 3992 s3 = (MatScalar)b[2+idx]; 3993 s4 = (MatScalar)b[3+idx]; 3994 while (nz--) { 3995 jdx = 4*(*vi++); 3996 x1 = t[jdx]; 3997 x2 = t[1+jdx]; 3998 x3 = t[2+jdx]; 3999 x4 = t[3+jdx]; 4000 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4001 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4002 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4003 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4004 v += 16; 4005 } 4006 t[idx] = s1; 4007 t[1+idx] = s2; 4008 t[2+idx] = s3; 4009 t[3+idx] = s4; 4010 } 4011 /* backward solve the upper triangular */ 4012 idt = 4*(n-1); 4013 for (i=n-1; i>=0; i--){ 4014 ai16 = 16*diag[i]; 4015 v = aa + ai16 + 16; 4016 vi = aj + diag[i] + 1; 4017 nz = ai[i+1] - diag[i] - 1; 4018 s1 = t[idt]; 4019 s2 = t[1+idt]; 4020 s3 = t[2+idt]; 4021 s4 = t[3+idt]; 4022 while (nz--) { 4023 idx = 4*(*vi++); 4024 x1 = (MatScalar)x[idx]; 4025 x2 = (MatScalar)x[1+idx]; 4026 x3 = (MatScalar)x[2+idx]; 4027 x4 = (MatScalar)x[3+idx]; 4028 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 4029 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 4030 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 4031 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 4032 v += 16; 4033 } 4034 v = aa + ai16; 4035 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 4036 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 4037 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 4038 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 4039 idt -= 4; 4040 } 4041 } 4042 4043 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4044 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4045 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4046 PetscFunctionReturn(0); 4047 } 4048 4049 #if defined (PETSC_HAVE_SSE) 4050 4051 #include PETSC_HAVE_SSE 4052 #undef __FUNCT__ 4053 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 4054 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 4055 { 4056 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4057 unsigned short *aj=(unsigned short *)a->j; 4058 PetscErrorCode ierr; 4059 int *ai=a->i,n=a->mbs,*diag = a->diag; 4060 MatScalar *aa=a->a; 4061 PetscScalar *x,*b; 4062 4063 PetscFunctionBegin; 4064 SSE_SCOPE_BEGIN; 4065 /* 4066 Note: This code currently uses demotion of double 4067 to float when performing the mixed-mode computation. 4068 This may not be numerically reasonable for all applications. 4069 */ 4070 PREFETCH_NTA(aa+16*ai[1]); 4071 4072 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4073 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4074 { 4075 /* x will first be computed in single precision then promoted inplace to double */ 4076 MatScalar *v,*t=(MatScalar *)x; 4077 int nz,i,idt,ai16; 4078 unsigned int jdx,idx; 4079 unsigned short *vi; 4080 /* Forward solve the lower triangular factor. */ 4081 4082 /* First block is the identity. */ 4083 idx = 0; 4084 CONVERT_DOUBLE4_FLOAT4(t,b); 4085 v = aa + 16*((unsigned int)ai[1]); 4086 4087 for (i=1; i<n;) { 4088 PREFETCH_NTA(&v[8]); 4089 vi = aj + ai[i]; 4090 nz = diag[i] - ai[i]; 4091 idx += 4; 4092 4093 /* Demote RHS from double to float. */ 4094 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4095 LOAD_PS(&t[idx],XMM7); 4096 4097 while (nz--) { 4098 PREFETCH_NTA(&v[16]); 4099 jdx = 4*((unsigned int)(*vi++)); 4100 4101 /* 4x4 Matrix-Vector product with negative accumulation: */ 4102 SSE_INLINE_BEGIN_2(&t[jdx],v) 4103 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4104 4105 /* First Column */ 4106 SSE_COPY_PS(XMM0,XMM6) 4107 SSE_SHUFFLE(XMM0,XMM0,0x00) 4108 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4109 SSE_SUB_PS(XMM7,XMM0) 4110 4111 /* Second Column */ 4112 SSE_COPY_PS(XMM1,XMM6) 4113 SSE_SHUFFLE(XMM1,XMM1,0x55) 4114 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4115 SSE_SUB_PS(XMM7,XMM1) 4116 4117 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4118 4119 /* Third Column */ 4120 SSE_COPY_PS(XMM2,XMM6) 4121 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4122 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4123 SSE_SUB_PS(XMM7,XMM2) 4124 4125 /* Fourth Column */ 4126 SSE_COPY_PS(XMM3,XMM6) 4127 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4128 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4129 SSE_SUB_PS(XMM7,XMM3) 4130 SSE_INLINE_END_2 4131 4132 v += 16; 4133 } 4134 v = aa + 16*ai[++i]; 4135 PREFETCH_NTA(v); 4136 STORE_PS(&t[idx],XMM7); 4137 } 4138 4139 /* Backward solve the upper triangular factor.*/ 4140 4141 idt = 4*(n-1); 4142 ai16 = 16*diag[n-1]; 4143 v = aa + ai16 + 16; 4144 for (i=n-1; i>=0;){ 4145 PREFETCH_NTA(&v[8]); 4146 vi = aj + diag[i] + 1; 4147 nz = ai[i+1] - diag[i] - 1; 4148 4149 LOAD_PS(&t[idt],XMM7); 4150 4151 while (nz--) { 4152 PREFETCH_NTA(&v[16]); 4153 idx = 4*((unsigned int)(*vi++)); 4154 4155 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4156 SSE_INLINE_BEGIN_2(&t[idx],v) 4157 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4158 4159 /* First Column */ 4160 SSE_COPY_PS(XMM0,XMM6) 4161 SSE_SHUFFLE(XMM0,XMM0,0x00) 4162 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4163 SSE_SUB_PS(XMM7,XMM0) 4164 4165 /* Second Column */ 4166 SSE_COPY_PS(XMM1,XMM6) 4167 SSE_SHUFFLE(XMM1,XMM1,0x55) 4168 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4169 SSE_SUB_PS(XMM7,XMM1) 4170 4171 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4172 4173 /* Third Column */ 4174 SSE_COPY_PS(XMM2,XMM6) 4175 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4176 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4177 SSE_SUB_PS(XMM7,XMM2) 4178 4179 /* Fourth Column */ 4180 SSE_COPY_PS(XMM3,XMM6) 4181 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4182 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4183 SSE_SUB_PS(XMM7,XMM3) 4184 SSE_INLINE_END_2 4185 v += 16; 4186 } 4187 v = aa + ai16; 4188 ai16 = 16*diag[--i]; 4189 PREFETCH_NTA(aa+ai16+16); 4190 /* 4191 Scale the result by the diagonal 4x4 block, 4192 which was inverted as part of the factorization 4193 */ 4194 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4195 /* First Column */ 4196 SSE_COPY_PS(XMM0,XMM7) 4197 SSE_SHUFFLE(XMM0,XMM0,0x00) 4198 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4199 4200 /* Second Column */ 4201 SSE_COPY_PS(XMM1,XMM7) 4202 SSE_SHUFFLE(XMM1,XMM1,0x55) 4203 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4204 SSE_ADD_PS(XMM0,XMM1) 4205 4206 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4207 4208 /* Third Column */ 4209 SSE_COPY_PS(XMM2,XMM7) 4210 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4211 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4212 SSE_ADD_PS(XMM0,XMM2) 4213 4214 /* Fourth Column */ 4215 SSE_COPY_PS(XMM3,XMM7) 4216 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4217 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4218 SSE_ADD_PS(XMM0,XMM3) 4219 4220 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4221 SSE_INLINE_END_3 4222 4223 v = aa + ai16 + 16; 4224 idt -= 4; 4225 } 4226 4227 /* Convert t from single precision back to double precision (inplace)*/ 4228 idt = 4*(n-1); 4229 for (i=n-1;i>=0;i--) { 4230 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4231 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4232 PetscScalar *xtemp=&x[idt]; 4233 MatScalar *ttemp=&t[idt]; 4234 xtemp[3] = (PetscScalar)ttemp[3]; 4235 xtemp[2] = (PetscScalar)ttemp[2]; 4236 xtemp[1] = (PetscScalar)ttemp[1]; 4237 xtemp[0] = (PetscScalar)ttemp[0]; 4238 idt -= 4; 4239 } 4240 4241 } /* End of artificial scope. */ 4242 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4243 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4244 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4245 SSE_SCOPE_END; 4246 PetscFunctionReturn(0); 4247 } 4248 4249 #undef __FUNCT__ 4250 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 4251 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 4252 { 4253 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4254 int *aj=a->j; 4255 PetscErrorCode ierr; 4256 int *ai=a->i,n=a->mbs,*diag = a->diag; 4257 MatScalar *aa=a->a; 4258 PetscScalar *x,*b; 4259 4260 PetscFunctionBegin; 4261 SSE_SCOPE_BEGIN; 4262 /* 4263 Note: This code currently uses demotion of double 4264 to float when performing the mixed-mode computation. 4265 This may not be numerically reasonable for all applications. 4266 */ 4267 PREFETCH_NTA(aa+16*ai[1]); 4268 4269 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 4270 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4271 { 4272 /* x will first be computed in single precision then promoted inplace to double */ 4273 MatScalar *v,*t=(MatScalar *)x; 4274 int nz,i,idt,ai16; 4275 int jdx,idx; 4276 int *vi; 4277 /* Forward solve the lower triangular factor. */ 4278 4279 /* First block is the identity. */ 4280 idx = 0; 4281 CONVERT_DOUBLE4_FLOAT4(t,b); 4282 v = aa + 16*ai[1]; 4283 4284 for (i=1; i<n;) { 4285 PREFETCH_NTA(&v[8]); 4286 vi = aj + ai[i]; 4287 nz = diag[i] - ai[i]; 4288 idx += 4; 4289 4290 /* Demote RHS from double to float. */ 4291 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 4292 LOAD_PS(&t[idx],XMM7); 4293 4294 while (nz--) { 4295 PREFETCH_NTA(&v[16]); 4296 jdx = 4*(*vi++); 4297 /* jdx = *vi++; */ 4298 4299 /* 4x4 Matrix-Vector product with negative accumulation: */ 4300 SSE_INLINE_BEGIN_2(&t[jdx],v) 4301 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4302 4303 /* First Column */ 4304 SSE_COPY_PS(XMM0,XMM6) 4305 SSE_SHUFFLE(XMM0,XMM0,0x00) 4306 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4307 SSE_SUB_PS(XMM7,XMM0) 4308 4309 /* Second Column */ 4310 SSE_COPY_PS(XMM1,XMM6) 4311 SSE_SHUFFLE(XMM1,XMM1,0x55) 4312 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4313 SSE_SUB_PS(XMM7,XMM1) 4314 4315 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4316 4317 /* Third Column */ 4318 SSE_COPY_PS(XMM2,XMM6) 4319 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4320 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4321 SSE_SUB_PS(XMM7,XMM2) 4322 4323 /* Fourth Column */ 4324 SSE_COPY_PS(XMM3,XMM6) 4325 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4326 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4327 SSE_SUB_PS(XMM7,XMM3) 4328 SSE_INLINE_END_2 4329 4330 v += 16; 4331 } 4332 v = aa + 16*ai[++i]; 4333 PREFETCH_NTA(v); 4334 STORE_PS(&t[idx],XMM7); 4335 } 4336 4337 /* Backward solve the upper triangular factor.*/ 4338 4339 idt = 4*(n-1); 4340 ai16 = 16*diag[n-1]; 4341 v = aa + ai16 + 16; 4342 for (i=n-1; i>=0;){ 4343 PREFETCH_NTA(&v[8]); 4344 vi = aj + diag[i] + 1; 4345 nz = ai[i+1] - diag[i] - 1; 4346 4347 LOAD_PS(&t[idt],XMM7); 4348 4349 while (nz--) { 4350 PREFETCH_NTA(&v[16]); 4351 idx = 4*(*vi++); 4352 /* idx = *vi++; */ 4353 4354 /* 4x4 Matrix-Vector Product with negative accumulation: */ 4355 SSE_INLINE_BEGIN_2(&t[idx],v) 4356 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 4357 4358 /* First Column */ 4359 SSE_COPY_PS(XMM0,XMM6) 4360 SSE_SHUFFLE(XMM0,XMM0,0x00) 4361 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 4362 SSE_SUB_PS(XMM7,XMM0) 4363 4364 /* Second Column */ 4365 SSE_COPY_PS(XMM1,XMM6) 4366 SSE_SHUFFLE(XMM1,XMM1,0x55) 4367 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 4368 SSE_SUB_PS(XMM7,XMM1) 4369 4370 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 4371 4372 /* Third Column */ 4373 SSE_COPY_PS(XMM2,XMM6) 4374 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4375 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 4376 SSE_SUB_PS(XMM7,XMM2) 4377 4378 /* Fourth Column */ 4379 SSE_COPY_PS(XMM3,XMM6) 4380 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4381 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 4382 SSE_SUB_PS(XMM7,XMM3) 4383 SSE_INLINE_END_2 4384 v += 16; 4385 } 4386 v = aa + ai16; 4387 ai16 = 16*diag[--i]; 4388 PREFETCH_NTA(aa+ai16+16); 4389 /* 4390 Scale the result by the diagonal 4x4 block, 4391 which was inverted as part of the factorization 4392 */ 4393 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 4394 /* First Column */ 4395 SSE_COPY_PS(XMM0,XMM7) 4396 SSE_SHUFFLE(XMM0,XMM0,0x00) 4397 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 4398 4399 /* Second Column */ 4400 SSE_COPY_PS(XMM1,XMM7) 4401 SSE_SHUFFLE(XMM1,XMM1,0x55) 4402 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 4403 SSE_ADD_PS(XMM0,XMM1) 4404 4405 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 4406 4407 /* Third Column */ 4408 SSE_COPY_PS(XMM2,XMM7) 4409 SSE_SHUFFLE(XMM2,XMM2,0xAA) 4410 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 4411 SSE_ADD_PS(XMM0,XMM2) 4412 4413 /* Fourth Column */ 4414 SSE_COPY_PS(XMM3,XMM7) 4415 SSE_SHUFFLE(XMM3,XMM3,0xFF) 4416 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 4417 SSE_ADD_PS(XMM0,XMM3) 4418 4419 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 4420 SSE_INLINE_END_3 4421 4422 v = aa + ai16 + 16; 4423 idt -= 4; 4424 } 4425 4426 /* Convert t from single precision back to double precision (inplace)*/ 4427 idt = 4*(n-1); 4428 for (i=n-1;i>=0;i--) { 4429 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 4430 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 4431 PetscScalar *xtemp=&x[idt]; 4432 MatScalar *ttemp=&t[idt]; 4433 xtemp[3] = (PetscScalar)ttemp[3]; 4434 xtemp[2] = (PetscScalar)ttemp[2]; 4435 xtemp[1] = (PetscScalar)ttemp[1]; 4436 xtemp[0] = (PetscScalar)ttemp[0]; 4437 idt -= 4; 4438 } 4439 4440 } /* End of artificial scope. */ 4441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 4442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4443 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 4444 SSE_SCOPE_END; 4445 PetscFunctionReturn(0); 4446 } 4447 4448 #endif 4449 4450 #undef __FUNCT__ 4451 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 4452 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 4453 { 4454 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4455 IS iscol=a->col,isrow=a->row; 4456 PetscErrorCode ierr; 4457 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4458 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4459 const MatScalar *aa=a->a,*v; 4460 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4461 const PetscScalar *b; 4462 4463 PetscFunctionBegin; 4464 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4465 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4466 t = a->solve_work; 4467 4468 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4469 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4470 4471 /* forward solve the lower triangular */ 4472 idx = 3*(*r++); 4473 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4474 for (i=1; i<n; i++) { 4475 v = aa + 9*ai[i]; 4476 vi = aj + ai[i]; 4477 nz = diag[i] - ai[i]; 4478 idx = 3*(*r++); 4479 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4480 while (nz--) { 4481 idx = 3*(*vi++); 4482 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4483 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4484 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4485 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4486 v += 9; 4487 } 4488 idx = 3*i; 4489 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4490 } 4491 /* backward solve the upper triangular */ 4492 for (i=n-1; i>=0; i--){ 4493 v = aa + 9*diag[i] + 9; 4494 vi = aj + diag[i] + 1; 4495 nz = ai[i+1] - diag[i] - 1; 4496 idt = 3*i; 4497 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4498 while (nz--) { 4499 idx = 3*(*vi++); 4500 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4501 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4502 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4503 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4504 v += 9; 4505 } 4506 idc = 3*(*c--); 4507 v = aa + 9*diag[i]; 4508 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4509 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4510 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4511 } 4512 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4513 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4514 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4516 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4517 PetscFunctionReturn(0); 4518 } 4519 4520 #undef __FUNCT__ 4521 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct" 4522 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx) 4523 { 4524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4525 IS iscol=a->col,isrow=a->row; 4526 PetscErrorCode ierr; 4527 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m; 4528 const PetscInt *r,*c,*rout,*cout; 4529 const MatScalar *aa=a->a,*v; 4530 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 4531 const PetscScalar *b; 4532 4533 PetscFunctionBegin; 4534 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4536 t = a->solve_work; 4537 4538 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4539 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4540 4541 /* forward solve the lower triangular */ 4542 idx = 3*r[0]; 4543 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 4544 for (i=1; i<n; i++) { 4545 v = aa + 9*ai[i]; 4546 vi = aj + ai[i]; 4547 nz = ai[i+1] - ai[i]; 4548 idx = 3*r[i]; 4549 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 4550 for(m=0;m<nz;m++){ 4551 idx = 3*vi[m]; 4552 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4553 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4554 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4555 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4556 v += 9; 4557 } 4558 idx = 3*i; 4559 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 4560 } 4561 /* backward solve the upper triangular */ 4562 for (i=n-1; i>=0; i--){ 4563 v = aa + 9*(adiag[i+1]+1); 4564 vi = aj + adiag[i+1]+1; 4565 nz = adiag[i] - adiag[i+1] - 1; 4566 idt = 3*i; 4567 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 4568 for(m=0;m<nz;m++){ 4569 idx = 3*vi[m]; 4570 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 4571 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4572 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4573 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4574 v += 9; 4575 } 4576 idc = 3*c[i]; 4577 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4578 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4579 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4580 } 4581 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4582 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4583 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4584 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4585 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4586 PetscFunctionReturn(0); 4587 } 4588 4589 /* 4590 Special case where the matrix was ILU(0) factored in the natural 4591 ordering. This eliminates the need for the column and row permutation. 4592 */ 4593 #undef __FUNCT__ 4594 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 4595 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 4596 { 4597 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4598 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4599 PetscErrorCode ierr; 4600 PetscInt *diag = a->diag; 4601 const MatScalar *aa=a->a,*v; 4602 PetscScalar *x,s1,s2,s3,x1,x2,x3; 4603 const PetscScalar *b; 4604 PetscInt jdx,idt,idx,nz,*vi,i; 4605 4606 PetscFunctionBegin; 4607 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4608 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4609 4610 /* forward solve the lower triangular */ 4611 idx = 0; 4612 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 4613 for (i=1; i<n; i++) { 4614 v = aa + 9*ai[i]; 4615 vi = aj + ai[i]; 4616 nz = diag[i] - ai[i]; 4617 idx += 3; 4618 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4619 while (nz--) { 4620 jdx = 3*(*vi++); 4621 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 4622 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4623 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4624 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4625 v += 9; 4626 } 4627 x[idx] = s1; 4628 x[1+idx] = s2; 4629 x[2+idx] = s3; 4630 } 4631 /* backward solve the upper triangular */ 4632 for (i=n-1; i>=0; i--){ 4633 v = aa + 9*diag[i] + 9; 4634 vi = aj + diag[i] + 1; 4635 nz = ai[i+1] - diag[i] - 1; 4636 idt = 3*i; 4637 s1 = x[idt]; s2 = x[1+idt]; 4638 s3 = x[2+idt]; 4639 while (nz--) { 4640 idx = 3*(*vi++); 4641 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 4642 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4643 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4644 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4645 v += 9; 4646 } 4647 v = aa + 9*diag[i]; 4648 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4649 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4650 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4651 } 4652 4653 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4654 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4655 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 4656 PetscFunctionReturn(0); 4657 } 4658 4659 #undef __FUNCT__ 4660 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct" 4661 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4662 { 4663 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4664 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz; 4665 PetscErrorCode ierr; 4666 PetscInt idx,jdx,idt; 4667 PetscInt bs = A->rmap->bs,bs2 = a->bs2; 4668 const MatScalar *aa=a->a,*v; 4669 PetscScalar *x; 4670 const PetscScalar *b; 4671 PetscScalar s1,s2,s3,x1,x2,x3; 4672 4673 PetscFunctionBegin; 4674 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4675 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4676 /* forward solve the lower triangular */ 4677 idx = 0; 4678 x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx]; 4679 for (i=1; i<n; i++) { 4680 v = aa + bs2*ai[i]; 4681 vi = aj + ai[i]; 4682 nz = ai[i+1] - ai[i]; 4683 idx = bs*i; 4684 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 4685 for(k=0;k<nz;k++){ 4686 jdx = bs*vi[k]; 4687 x1 = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx]; 4688 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4689 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4690 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4691 4692 v += bs2; 4693 } 4694 4695 x[idx] = s1; 4696 x[1+idx] = s2; 4697 x[2+idx] = s3; 4698 } 4699 4700 /* backward solve the upper triangular */ 4701 for (i=n-1; i>=0; i--){ 4702 v = aa + bs2*(adiag[i+1]+1); 4703 vi = aj + adiag[i+1]+1; 4704 nz = adiag[i] - adiag[i+1]-1; 4705 idt = bs*i; 4706 s1 = x[idt]; s2 = x[1+idt];s3 = x[2+idt]; 4707 4708 for(k=0;k<nz;k++){ 4709 idx = bs*vi[k]; 4710 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 4711 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 4712 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 4713 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 4714 4715 v += bs2; 4716 } 4717 /* x = inv_diagonal*x */ 4718 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 4719 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 4720 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 4721 4722 } 4723 4724 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4725 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4726 ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr); 4727 PetscFunctionReturn(0); 4728 } 4729 4730 #undef __FUNCT__ 4731 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 4732 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 4733 { 4734 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4735 IS iscol=a->col,isrow=a->row; 4736 PetscErrorCode ierr; 4737 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 4738 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4739 const MatScalar *aa=a->a,*v; 4740 PetscScalar *x,s1,s2,x1,x2,*t; 4741 const PetscScalar *b; 4742 4743 PetscFunctionBegin; 4744 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4745 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4746 t = a->solve_work; 4747 4748 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4749 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 4750 4751 /* forward solve the lower triangular */ 4752 idx = 2*(*r++); 4753 t[0] = b[idx]; t[1] = b[1+idx]; 4754 for (i=1; i<n; i++) { 4755 v = aa + 4*ai[i]; 4756 vi = aj + ai[i]; 4757 nz = diag[i] - ai[i]; 4758 idx = 2*(*r++); 4759 s1 = b[idx]; s2 = b[1+idx]; 4760 while (nz--) { 4761 idx = 2*(*vi++); 4762 x1 = t[idx]; x2 = t[1+idx]; 4763 s1 -= v[0]*x1 + v[2]*x2; 4764 s2 -= v[1]*x1 + v[3]*x2; 4765 v += 4; 4766 } 4767 idx = 2*i; 4768 t[idx] = s1; t[1+idx] = s2; 4769 } 4770 /* backward solve the upper triangular */ 4771 for (i=n-1; i>=0; i--){ 4772 v = aa + 4*diag[i] + 4; 4773 vi = aj + diag[i] + 1; 4774 nz = ai[i+1] - diag[i] - 1; 4775 idt = 2*i; 4776 s1 = t[idt]; s2 = t[1+idt]; 4777 while (nz--) { 4778 idx = 2*(*vi++); 4779 x1 = t[idx]; x2 = t[1+idx]; 4780 s1 -= v[0]*x1 + v[2]*x2; 4781 s2 -= v[1]*x1 + v[3]*x2; 4782 v += 4; 4783 } 4784 idc = 2*(*c--); 4785 v = aa + 4*diag[i]; 4786 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4787 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4788 } 4789 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4790 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4791 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4792 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4793 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4794 PetscFunctionReturn(0); 4795 } 4796 4797 #undef __FUNCT__ 4798 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct" 4799 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx) 4800 { 4801 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4802 IS iscol=a->col,isrow=a->row; 4803 PetscErrorCode ierr; 4804 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m; 4805 const PetscInt *r,*c,*rout,*cout; 4806 const MatScalar *aa=a->a,*v; 4807 PetscScalar *x,s1,s2,x1,x2,*t; 4808 const PetscScalar *b; 4809 4810 PetscFunctionBegin; 4811 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4812 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4813 t = a->solve_work; 4814 4815 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 4816 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 4817 4818 /* forward solve the lower triangular */ 4819 idx = 2*r[0]; 4820 t[0] = b[idx]; t[1] = b[1+idx]; 4821 for (i=1; i<n; i++) { 4822 v = aa + 4*ai[i]; 4823 vi = aj + ai[i]; 4824 nz = ai[i+1] - ai[i]; 4825 idx = 2*r[i]; 4826 s1 = b[idx]; s2 = b[1+idx]; 4827 for(m=0;m<nz;m++){ 4828 jdx = 2*vi[m]; 4829 x1 = t[jdx]; x2 = t[1+jdx]; 4830 s1 -= v[0]*x1 + v[2]*x2; 4831 s2 -= v[1]*x1 + v[3]*x2; 4832 v += 4; 4833 } 4834 idx = 2*i; 4835 t[idx] = s1; t[1+idx] = s2; 4836 } 4837 /* backward solve the upper triangular */ 4838 for (i=n-1; i>=0; i--){ 4839 v = aa + 4*(adiag[i+1]+1); 4840 vi = aj + adiag[i+1]+1; 4841 nz = adiag[i] - adiag[i+1] - 1; 4842 idt = 2*i; 4843 s1 = t[idt]; s2 = t[1+idt]; 4844 for(m=0;m<nz;m++){ 4845 idx = 2*vi[m]; 4846 x1 = t[idx]; x2 = t[1+idx]; 4847 s1 -= v[0]*x1 + v[2]*x2; 4848 s2 -= v[1]*x1 + v[3]*x2; 4849 v += 4; 4850 } 4851 idc = 2*c[i]; 4852 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 4853 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 4854 } 4855 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 4856 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 4857 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4858 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4859 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4860 PetscFunctionReturn(0); 4861 } 4862 4863 /* 4864 Special case where the matrix was ILU(0) factored in the natural 4865 ordering. This eliminates the need for the column and row permutation. 4866 */ 4867 #undef __FUNCT__ 4868 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 4869 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 4870 { 4871 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4872 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 4873 PetscErrorCode ierr; 4874 PetscInt *diag = a->diag; 4875 const MatScalar *aa=a->a,*v; 4876 PetscScalar *x,s1,s2,x1,x2; 4877 const PetscScalar *b; 4878 PetscInt jdx,idt,idx,nz,*vi,i; 4879 4880 PetscFunctionBegin; 4881 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4882 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4883 4884 /* forward solve the lower triangular */ 4885 idx = 0; 4886 x[0] = b[0]; x[1] = b[1]; 4887 for (i=1; i<n; i++) { 4888 v = aa + 4*ai[i]; 4889 vi = aj + ai[i]; 4890 nz = diag[i] - ai[i]; 4891 idx += 2; 4892 s1 = b[idx];s2 = b[1+idx]; 4893 while (nz--) { 4894 jdx = 2*(*vi++); 4895 x1 = x[jdx];x2 = x[1+jdx]; 4896 s1 -= v[0]*x1 + v[2]*x2; 4897 s2 -= v[1]*x1 + v[3]*x2; 4898 v += 4; 4899 } 4900 x[idx] = s1; 4901 x[1+idx] = s2; 4902 } 4903 /* backward solve the upper triangular */ 4904 for (i=n-1; i>=0; i--){ 4905 v = aa + 4*diag[i] + 4; 4906 vi = aj + diag[i] + 1; 4907 nz = ai[i+1] - diag[i] - 1; 4908 idt = 2*i; 4909 s1 = x[idt]; s2 = x[1+idt]; 4910 while (nz--) { 4911 idx = 2*(*vi++); 4912 x1 = x[idx]; x2 = x[1+idx]; 4913 s1 -= v[0]*x1 + v[2]*x2; 4914 s2 -= v[1]*x1 + v[3]*x2; 4915 v += 4; 4916 } 4917 v = aa + 4*diag[i]; 4918 x[idt] = v[0]*s1 + v[2]*s2; 4919 x[1+idt] = v[1]*s1 + v[3]*s2; 4920 } 4921 4922 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4923 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4924 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4925 PetscFunctionReturn(0); 4926 } 4927 4928 #undef __FUNCT__ 4929 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct" 4930 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 4931 { 4932 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 4933 PetscInt i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt; 4934 PetscErrorCode ierr; 4935 PetscInt jdx; 4936 const MatScalar *aa=a->a,*v; 4937 PetscScalar *x,s1,s2,x1,x2; 4938 const PetscScalar *b; 4939 4940 PetscFunctionBegin; 4941 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4942 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 4943 /* forward solve the lower triangular */ 4944 idx = 0; 4945 x[0] = b[idx]; x[1] = b[1+idx]; 4946 for (i=1; i<n; i++) { 4947 v = aa + 4*ai[i]; 4948 vi = aj + ai[i]; 4949 nz = ai[i+1] - ai[i]; 4950 idx = 2*i; 4951 s1 = b[idx];s2 = b[1+idx]; 4952 for(k=0;k<nz;k++){ 4953 jdx = 2*vi[k]; 4954 x1 = x[jdx];x2 = x[1+jdx]; 4955 s1 -= v[0]*x1 + v[2]*x2; 4956 s2 -= v[1]*x1 + v[3]*x2; 4957 v += 4; 4958 } 4959 x[idx] = s1; 4960 x[1+idx] = s2; 4961 } 4962 4963 /* backward solve the upper triangular */ 4964 for (i=n-1; i>=0; i--){ 4965 v = aa + 4*(adiag[i+1]+1); 4966 vi = aj + adiag[i+1]+1; 4967 nz = adiag[i] - adiag[i+1]-1; 4968 idt = 2*i; 4969 s1 = x[idt]; s2 = x[1+idt]; 4970 for(k=0;k<nz;k++){ 4971 idx = 2*vi[k]; 4972 x1 = x[idx]; x2 = x[1+idx]; 4973 s1 -= v[0]*x1 + v[2]*x2; 4974 s2 -= v[1]*x1 + v[3]*x2; 4975 v += 4; 4976 } 4977 /* x = inv_diagonal*x */ 4978 x[idt] = v[0]*s1 + v[2]*s2; 4979 x[1+idt] = v[1]*s1 + v[3]*s2; 4980 } 4981 4982 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 4983 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 4984 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 4985 PetscFunctionReturn(0); 4986 } 4987 4988 #undef __FUNCT__ 4989 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 4990 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 4991 { 4992 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 4993 IS iscol=a->col,isrow=a->row; 4994 PetscErrorCode ierr; 4995 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 4996 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 4997 MatScalar *aa=a->a,*v; 4998 PetscScalar *x,*b,s1,*t; 4999 5000 PetscFunctionBegin; 5001 if (!n) PetscFunctionReturn(0); 5002 5003 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5004 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5005 t = a->solve_work; 5006 5007 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 5008 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 5009 5010 /* forward solve the lower triangular */ 5011 t[0] = b[*r++]; 5012 for (i=1; i<n; i++) { 5013 v = aa + ai[i]; 5014 vi = aj + ai[i]; 5015 nz = diag[i] - ai[i]; 5016 s1 = b[*r++]; 5017 while (nz--) { 5018 s1 -= (*v++)*t[*vi++]; 5019 } 5020 t[i] = s1; 5021 } 5022 /* backward solve the upper triangular */ 5023 for (i=n-1; i>=0; i--){ 5024 v = aa + diag[i] + 1; 5025 vi = aj + diag[i] + 1; 5026 nz = ai[i+1] - diag[i] - 1; 5027 s1 = t[i]; 5028 while (nz--) { 5029 s1 -= (*v++)*t[*vi++]; 5030 } 5031 x[*c--] = t[i] = aa[diag[i]]*s1; 5032 } 5033 5034 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 5035 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 5036 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5037 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5038 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5039 PetscFunctionReturn(0); 5040 } 5041 /* 5042 Special case where the matrix was ILU(0) factored in the natural 5043 ordering. This eliminates the need for the column and row permutation. 5044 */ 5045 #undef __FUNCT__ 5046 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 5047 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 5048 { 5049 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5050 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 5051 PetscErrorCode ierr; 5052 PetscInt *diag = a->diag; 5053 MatScalar *aa=a->a; 5054 PetscScalar *x,*b; 5055 PetscScalar s1,x1; 5056 MatScalar *v; 5057 PetscInt jdx,idt,idx,nz,*vi,i; 5058 5059 PetscFunctionBegin; 5060 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 5061 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 5062 5063 /* forward solve the lower triangular */ 5064 idx = 0; 5065 x[0] = b[0]; 5066 for (i=1; i<n; i++) { 5067 v = aa + ai[i]; 5068 vi = aj + ai[i]; 5069 nz = diag[i] - ai[i]; 5070 idx += 1; 5071 s1 = b[idx]; 5072 while (nz--) { 5073 jdx = *vi++; 5074 x1 = x[jdx]; 5075 s1 -= v[0]*x1; 5076 v += 1; 5077 } 5078 x[idx] = s1; 5079 } 5080 /* backward solve the upper triangular */ 5081 for (i=n-1; i>=0; i--){ 5082 v = aa + diag[i] + 1; 5083 vi = aj + diag[i] + 1; 5084 nz = ai[i+1] - diag[i] - 1; 5085 idt = i; 5086 s1 = x[idt]; 5087 while (nz--) { 5088 idx = *vi++; 5089 x1 = x[idx]; 5090 s1 -= v[0]*x1; 5091 v += 1; 5092 } 5093 v = aa + diag[i]; 5094 x[idt] = v[0]*s1; 5095 } 5096 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 5097 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 5098 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 5099 PetscFunctionReturn(0); 5100 } 5101 5102 /* ----------------------------------------------------------------*/ 5103 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth); 5104 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 5105 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth); 5106 5107 #undef __FUNCT__ 5108 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 5109 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 5110 { 5111 Mat C=B; 5112 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 5113 IS isrow = b->row,isicol = b->icol; 5114 PetscErrorCode ierr; 5115 const PetscInt *r,*ic,*ics; 5116 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 5117 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 5118 MatScalar *rtmp,*pc,*mwork,*v,*pv,*aa=a->a; 5119 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 5120 MatScalar *v_work; 5121 PetscTruth col_identity,row_identity,both_identity; 5122 5123 PetscFunctionBegin; 5124 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5125 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5126 5127 ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 5128 ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr); 5129 ics = ic; 5130 5131 /* generate work space needed by dense LU factorization */ 5132 ierr = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr); 5133 5134 for (i=0; i<n; i++){ 5135 /* zero rtmp */ 5136 /* L part */ 5137 nz = bi[i+1] - bi[i]; 5138 bjtmp = bj + bi[i]; 5139 for (j=0; j<nz; j++){ 5140 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5141 } 5142 5143 /* U part */ 5144 nz = bdiag[i] - bdiag[i+1]; 5145 bjtmp = bj + bdiag[i+1]+1; 5146 for (j=0; j<nz; j++){ 5147 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5148 } 5149 5150 /* load in initial (unfactored row) */ 5151 nz = ai[r[i]+1] - ai[r[i]]; 5152 ajtmp = aj + ai[r[i]]; 5153 v = aa + bs2*ai[r[i]]; 5154 for (j=0; j<nz; j++) { 5155 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 5156 } 5157 5158 /* elimination */ 5159 bjtmp = bj + bi[i]; 5160 nzL = bi[i+1] - bi[i]; 5161 for(k=0;k < nzL;k++) { 5162 row = bjtmp[k]; 5163 pc = rtmp + bs2*row; 5164 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 5165 if (flg) { 5166 pv = b->a + bs2*bdiag[row]; 5167 Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */ 5168 pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */ 5169 pv = b->a + bs2*(bdiag[row+1]+1); 5170 nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */ 5171 for (j=0; j<nz; j++) { 5172 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 5173 } 5174 ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */ 5175 } 5176 } 5177 5178 /* finished row so stick it into b->a */ 5179 /* L part */ 5180 pv = b->a + bs2*bi[i] ; 5181 pj = b->j + bi[i] ; 5182 nz = bi[i+1] - bi[i]; 5183 for (j=0; j<nz; j++) { 5184 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5185 } 5186 5187 /* Mark diagonal and invert diagonal for simplier triangular solves */ 5188 pv = b->a + bs2*bdiag[i]; 5189 pj = b->j + bdiag[i]; 5190 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 5191 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5192 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 5193 5194 /* U part */ 5195 pv = b->a + bs2*(bdiag[i+1]+1); 5196 pj = b->j + bdiag[i+1]+1; 5197 nz = bdiag[i] - bdiag[i+1] - 1; 5198 for (j=0; j<nz; j++){ 5199 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 5200 } 5201 } 5202 5203 ierr = PetscFree(rtmp);CHKERRQ(ierr); 5204 ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr); 5205 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5206 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5207 5208 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5209 ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr); 5210 both_identity = (PetscTruth) (row_identity && col_identity); 5211 if (both_identity){ 5212 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 5213 } else { 5214 C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct; 5215 } 5216 C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct; 5217 5218 C->assembled = PETSC_TRUE; 5219 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 5220 PetscFunctionReturn(0); 5221 } 5222 5223 /* 5224 ilu(0) with natural ordering under new data structure. 5225 See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description 5226 because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct(). 5227 */ 5228 5229 #undef __FUNCT__ 5230 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 5231 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5232 { 5233 5234 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5235 PetscErrorCode ierr; 5236 PetscInt n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 5237 PetscInt i,j,nz,*bi,*bj,*bdiag,bi_temp; 5238 5239 PetscFunctionBegin; 5240 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr); 5241 b = (Mat_SeqBAIJ*)(fact)->data; 5242 5243 /* allocate matrix arrays for new data structure */ 5244 ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr); 5245 ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5246 b->singlemalloc = PETSC_TRUE; 5247 if (!b->diag){ 5248 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr); 5249 ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr); 5250 } 5251 bdiag = b->diag; 5252 5253 if (n > 0) { 5254 ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr); 5255 } 5256 5257 /* set bi and bj with new data structure */ 5258 bi = b->i; 5259 bj = b->j; 5260 5261 /* L part */ 5262 bi[0] = 0; 5263 for (i=0; i<n; i++){ 5264 nz = adiag[i] - ai[i]; 5265 bi[i+1] = bi[i] + nz; 5266 aj = a->j + ai[i]; 5267 for (j=0; j<nz; j++){ 5268 *bj = aj[j]; bj++; 5269 } 5270 } 5271 5272 /* U part */ 5273 bi_temp = bi[n]; 5274 bdiag[n] = bi[n]-1; 5275 for (i=n-1; i>=0; i--){ 5276 nz = ai[i+1] - adiag[i] - 1; 5277 bi_temp = bi_temp + nz + 1; 5278 aj = a->j + adiag[i] + 1; 5279 for (j=0; j<nz; j++){ 5280 *bj = aj[j]; bj++; 5281 } 5282 /* diag[i] */ 5283 *bj = i; bj++; 5284 bdiag[i] = bi_temp - 1; 5285 } 5286 PetscFunctionReturn(0); 5287 } 5288 5289 #undef __FUNCT__ 5290 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct" 5291 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5292 { 5293 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5294 IS isicol; 5295 PetscErrorCode ierr; 5296 const PetscInt *r,*ic; 5297 PetscInt n=a->mbs,*ai=a->i,*aj=a->j,d; 5298 PetscInt *bi,*cols,nnz,*cols_lvl; 5299 PetscInt *bdiag,prow,fm,nzbd,reallocs=0,dcount=0; 5300 PetscInt i,levels,diagonal_fill; 5301 PetscTruth col_identity,row_identity,both_identity; 5302 PetscReal f; 5303 PetscInt nlnk,*lnk,*lnk_lvl=PETSC_NULL; 5304 PetscBT lnkbt; 5305 PetscInt nzi,*bj,**bj_ptr,**bjlvl_ptr; 5306 PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL; 5307 PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL; 5308 PetscTruth missing; 5309 PetscInt bs=A->rmap->bs,bs2=a->bs2; 5310 5311 PetscFunctionBegin; 5312 if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n); 5313 ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr); 5314 if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d); 5315 5316 f = info->fill; 5317 levels = (PetscInt)info->levels; 5318 diagonal_fill = (PetscInt)info->diagonal_fill; 5319 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5320 5321 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5322 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5323 both_identity = (PetscTruth) (row_identity && col_identity); 5324 5325 if (!levels && both_identity) { 5326 /* special case: ilu(0) with natural ordering */ 5327 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5328 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5329 5330 fact->factor = MAT_FACTOR_ILU; 5331 (fact)->info.factor_mallocs = 0; 5332 (fact)->info.fill_ratio_given = info->fill; 5333 (fact)->info.fill_ratio_needed = 1.0; 5334 b = (Mat_SeqBAIJ*)(fact)->data; 5335 b->row = isrow; 5336 b->col = iscol; 5337 b->icol = isicol; 5338 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5339 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5340 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5341 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5342 PetscFunctionReturn(0); 5343 } 5344 5345 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5346 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5347 5348 /* get new row pointers */ 5349 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr); 5350 bi[0] = 0; 5351 /* bdiag is location of diagonal in factor */ 5352 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr); 5353 bdiag[0] = 0; 5354 5355 ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr); 5356 5357 /* create a linked list for storing column indices of the active row */ 5358 nlnk = n + 1; 5359 ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5360 5361 /* initial FreeSpace size is f*(ai[n]+1) */ 5362 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr); 5363 current_space = free_space; 5364 ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr); 5365 current_space_lvl = free_space_lvl; 5366 5367 for (i=0; i<n; i++) { 5368 nzi = 0; 5369 /* copy current row into linked list */ 5370 nnz = ai[r[i]+1] - ai[r[i]]; 5371 if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i); 5372 cols = aj + ai[r[i]]; 5373 lnk[i] = -1; /* marker to indicate if diagonal exists */ 5374 ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr); 5375 nzi += nlnk; 5376 5377 /* make sure diagonal entry is included */ 5378 if (diagonal_fill && lnk[i] == -1) { 5379 fm = n; 5380 while (lnk[fm] < i) fm = lnk[fm]; 5381 lnk[i] = lnk[fm]; /* insert diagonal into linked list */ 5382 lnk[fm] = i; 5383 lnk_lvl[i] = 0; 5384 nzi++; dcount++; 5385 } 5386 5387 /* add pivot rows into the active row */ 5388 nzbd = 0; 5389 prow = lnk[n]; 5390 while (prow < i) { 5391 nnz = bdiag[prow]; 5392 cols = bj_ptr[prow] + nnz + 1; 5393 cols_lvl = bjlvl_ptr[prow] + nnz + 1; 5394 nnz = bi[prow+1] - bi[prow] - nnz - 1; 5395 ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr); 5396 nzi += nlnk; 5397 prow = lnk[prow]; 5398 nzbd++; 5399 } 5400 bdiag[i] = nzbd; 5401 bi[i+1] = bi[i] + nzi; 5402 5403 /* if free space is not available, make more free space */ 5404 if (current_space->local_remaining<nzi) { 5405 nnz = 2*nzi*(n - i); /* estimated and max additional space needed */ 5406 ierr = PetscFreeSpaceGet(nnz,¤t_space);CHKERRQ(ierr); 5407 ierr = PetscFreeSpaceGet(nnz,¤t_space_lvl);CHKERRQ(ierr); 5408 reallocs++; 5409 } 5410 5411 /* copy data into free_space and free_space_lvl, then initialize lnk */ 5412 ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr); 5413 bj_ptr[i] = current_space->array; 5414 bjlvl_ptr[i] = current_space_lvl->array; 5415 5416 /* make sure the active row i has diagonal entry */ 5417 if (*(bj_ptr[i]+bdiag[i]) != i) { 5418 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5419 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i); 5420 } 5421 5422 current_space->array += nzi; 5423 current_space->local_used += nzi; 5424 current_space->local_remaining -= nzi; 5425 current_space_lvl->array += nzi; 5426 current_space_lvl->local_used += nzi; 5427 current_space_lvl->local_remaining -= nzi; 5428 } 5429 5430 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5431 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5432 5433 /* destroy list of free space and other temporary arrays */ 5434 ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr); 5435 5436 /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */ 5437 ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr); 5438 5439 ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr); 5440 ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr); 5441 ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr); 5442 5443 #if defined(PETSC_USE_INFO) 5444 { 5445 PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]); 5446 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr); 5447 ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5448 ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr); 5449 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5450 if (diagonal_fill) { 5451 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr); 5452 } 5453 } 5454 #endif 5455 5456 /* put together the new matrix */ 5457 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5458 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5459 b = (Mat_SeqBAIJ*)(fact)->data; 5460 b->free_a = PETSC_TRUE; 5461 b->free_ij = PETSC_TRUE; 5462 b->singlemalloc = PETSC_FALSE; 5463 ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5464 b->j = bj; 5465 b->i = bi; 5466 b->diag = bdiag; 5467 b->free_diag = PETSC_TRUE; 5468 b->ilen = 0; 5469 b->imax = 0; 5470 b->row = isrow; 5471 b->col = iscol; 5472 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5473 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5474 b->icol = isicol; 5475 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5476 /* In b structure: Free imax, ilen, old a, old j. 5477 Allocate bdiag, solve_work, new a, new j */ 5478 ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr); 5479 b->maxnz = b->nz = bdiag[0]+1; 5480 fact->info.factor_mallocs = reallocs; 5481 fact->info.fill_ratio_given = f; 5482 fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]); 5483 ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr); 5484 PetscFunctionReturn(0); 5485 } 5486 5487 5488 /* 5489 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 5490 except that the data structure of Mat_SeqAIJ is slightly different. 5491 Not a good example of code reuse. 5492 */ 5493 #undef __FUNCT__ 5494 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 5495 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 5496 { 5497 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 5498 IS isicol; 5499 PetscErrorCode ierr; 5500 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 5501 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 5502 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 5503 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 5504 PetscTruth col_identity,row_identity,both_identity,flg; 5505 PetscReal f; 5506 PetscTruth newdatastruct = PETSC_FALSE; 5507 5508 PetscFunctionBegin; 5509 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 5510 if (newdatastruct){ 5511 ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 5512 PetscFunctionReturn(0); 5513 } 5514 5515 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 5516 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 5517 5518 f = info->fill; 5519 levels = (PetscInt)info->levels; 5520 diagonal_fill = (PetscInt)info->diagonal_fill; 5521 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 5522 5523 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 5524 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 5525 both_identity = (PetscTruth) (row_identity && col_identity); 5526 5527 if (!levels && both_identity) { /* special case copy the nonzero structure */ 5528 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr); 5529 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5530 5531 fact->factor = MAT_FACTOR_ILU; 5532 b = (Mat_SeqBAIJ*)fact->data; 5533 b->row = isrow; 5534 b->col = iscol; 5535 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5536 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5537 b->icol = isicol; 5538 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5539 ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5540 PetscFunctionReturn(0); 5541 } 5542 5543 /* general case perform the symbolic factorization */ 5544 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 5545 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 5546 5547 /* get new row pointers */ 5548 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 5549 ainew[0] = 0; 5550 /* don't know how many column pointers are needed so estimate */ 5551 jmax = (PetscInt)(f*ai[n] + 1); 5552 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 5553 /* ajfill is level of fill for each fill entry */ 5554 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 5555 /* fill is a linked list of nonzeros in active row */ 5556 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 5557 /* im is level for each filled value */ 5558 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 5559 /* dloc is location of diagonal in factor */ 5560 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 5561 dloc[0] = 0; 5562 for (prow=0; prow<n; prow++) { 5563 5564 /* copy prow into linked list */ 5565 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 5566 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 5567 xi = aj + ai[r[prow]]; 5568 fill[n] = n; 5569 fill[prow] = -1; /* marker for diagonal entry */ 5570 while (nz--) { 5571 fm = n; 5572 idx = ic[*xi++]; 5573 do { 5574 m = fm; 5575 fm = fill[m]; 5576 } while (fm < idx); 5577 fill[m] = idx; 5578 fill[idx] = fm; 5579 im[idx] = 0; 5580 } 5581 5582 /* make sure diagonal entry is included */ 5583 if (diagonal_fill && fill[prow] == -1) { 5584 fm = n; 5585 while (fill[fm] < prow) fm = fill[fm]; 5586 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 5587 fill[fm] = prow; 5588 im[prow] = 0; 5589 nzf++; 5590 dcount++; 5591 } 5592 5593 nzi = 0; 5594 row = fill[n]; 5595 while (row < prow) { 5596 incrlev = im[row] + 1; 5597 nz = dloc[row]; 5598 xi = ajnew + ainew[row] + nz + 1; 5599 flev = ajfill + ainew[row] + nz + 1; 5600 nnz = ainew[row+1] - ainew[row] - nz - 1; 5601 fm = row; 5602 while (nnz-- > 0) { 5603 idx = *xi++; 5604 if (*flev + incrlev > levels) { 5605 flev++; 5606 continue; 5607 } 5608 do { 5609 m = fm; 5610 fm = fill[m]; 5611 } while (fm < idx); 5612 if (fm != idx) { 5613 im[idx] = *flev + incrlev; 5614 fill[m] = idx; 5615 fill[idx] = fm; 5616 fm = idx; 5617 nzf++; 5618 } else { 5619 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 5620 } 5621 flev++; 5622 } 5623 row = fill[row]; 5624 nzi++; 5625 } 5626 /* copy new filled row into permanent storage */ 5627 ainew[prow+1] = ainew[prow] + nzf; 5628 if (ainew[prow+1] > jmax) { 5629 5630 /* estimate how much additional space we will need */ 5631 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 5632 /* just double the memory each time */ 5633 PetscInt maxadd = jmax; 5634 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 5635 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 5636 jmax += maxadd; 5637 5638 /* allocate a longer ajnew and ajfill */ 5639 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5640 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5641 ierr = PetscFree(ajnew);CHKERRQ(ierr); 5642 ajnew = xitmp; 5643 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 5644 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 5645 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5646 ajfill = xitmp; 5647 reallocate++; /* count how many reallocations are needed */ 5648 } 5649 xitmp = ajnew + ainew[prow]; 5650 flev = ajfill + ainew[prow]; 5651 dloc[prow] = nzi; 5652 fm = fill[n]; 5653 while (nzf--) { 5654 *xitmp++ = fm; 5655 *flev++ = im[fm]; 5656 fm = fill[fm]; 5657 } 5658 /* make sure row has diagonal entry */ 5659 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 5660 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 5661 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 5662 } 5663 } 5664 ierr = PetscFree(ajfill);CHKERRQ(ierr); 5665 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 5666 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 5667 ierr = PetscFree(fill);CHKERRQ(ierr); 5668 ierr = PetscFree(im);CHKERRQ(ierr); 5669 5670 #if defined(PETSC_USE_INFO) 5671 { 5672 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 5673 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 5674 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 5675 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 5676 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 5677 if (diagonal_fill) { 5678 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 5679 } 5680 } 5681 #endif 5682 5683 /* put together the new matrix */ 5684 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 5685 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 5686 b = (Mat_SeqBAIJ*)fact->data; 5687 b->free_a = PETSC_TRUE; 5688 b->free_ij = PETSC_TRUE; 5689 b->singlemalloc = PETSC_FALSE; 5690 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 5691 b->j = ajnew; 5692 b->i = ainew; 5693 for (i=0; i<n; i++) dloc[i] += ainew[i]; 5694 b->diag = dloc; 5695 b->free_diag = PETSC_TRUE; 5696 b->ilen = 0; 5697 b->imax = 0; 5698 b->row = isrow; 5699 b->col = iscol; 5700 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 5701 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 5702 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 5703 b->icol = isicol; 5704 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 5705 /* In b structure: Free imax, ilen, old a, old j. 5706 Allocate dloc, solve_work, new a, new j */ 5707 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 5708 b->maxnz = b->nz = ainew[n]; 5709 5710 fact->info.factor_mallocs = reallocate; 5711 fact->info.fill_ratio_given = f; 5712 fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 5713 5714 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 5715 PetscFunctionReturn(0); 5716 } 5717 5718 #undef __FUNCT__ 5719 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 5720 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 5721 { 5722 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 5723 /* int i,*AJ=a->j,nz=a->nz; */ 5724 PetscFunctionBegin; 5725 /* Undo Column scaling */ 5726 /* while (nz--) { */ 5727 /* AJ[i] = AJ[i]/4; */ 5728 /* } */ 5729 /* This should really invoke a push/pop logic, but we don't have that yet. */ 5730 A->ops->setunfactored = PETSC_NULL; 5731 PetscFunctionReturn(0); 5732 } 5733 5734 #undef __FUNCT__ 5735 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 5736 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 5737 { 5738 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 5739 PetscInt *AJ=a->j,nz=a->nz; 5740 unsigned short *aj=(unsigned short *)AJ; 5741 PetscFunctionBegin; 5742 /* Is this really necessary? */ 5743 while (nz--) { 5744 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 5745 } 5746 A->ops->setunfactored = PETSC_NULL; 5747 PetscFunctionReturn(0); 5748 } 5749 5750 5751