1 #define PETSCMAT_DLL 2 3 4 /* 5 Factorization code for BAIJ format. 6 */ 7 8 #include "../src/mat/impls/baij/seq/baij.h" 9 #include "../src/mat/blockinvert.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 PetscErrorCode ierr; 17 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 PetscInt *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 PetscScalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 24 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 25 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 26 27 /* forward solve the U^T */ 28 for (i=0; i<n; i++) { 29 30 v = aa + diag[i]; 31 /* multiply by the inverse of the block diagonal */ 32 s1 = (*v++)*x[i]; 33 vi = aj + diag[i] + 1; 34 nz = ai[i+1] - diag[i] - 1; 35 while (nz--) { 36 x[*vi++] -= (*v++)*s1; 37 } 38 x[i] = s1; 39 } 40 /* backward solve the L^T */ 41 for (i=n-1; i>=0; i--){ 42 v = aa + diag[i] - 1; 43 vi = aj + diag[i] - 1; 44 nz = diag[i] - ai[i]; 45 s1 = x[i]; 46 while (nz--) { 47 x[*vi--] -= (*v--)*s1; 48 } 49 } 50 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 51 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 52 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 53 PetscFunctionReturn(0); 54 } 55 56 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 57 { 58 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 59 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60 PetscErrorCode ierr; 61 PetscInt jdx; 62 const MatScalar *aa=a->a,*v; 63 PetscScalar *x,s1,s2,x1,x2; 64 const PetscScalar *b; 65 66 PetscFunctionBegin; 67 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 68 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 69 /* forward solve the lower triangular */ 70 idx = 0; 71 x[0] = b[idx]; x[1] = b[1+idx]; 72 for (i=1; i<n; i++) { 73 v = aa + 4*ai[i]; 74 vi = aj + ai[i]; 75 nz = ai[i+1] - ai[i]; 76 idx = 2*i; 77 s1 = b[idx];s2 = b[1+idx]; 78 while (nz--) { 79 jdx = 2*(*vi++); 80 x1 = x[jdx];x2 = x[1+jdx]; 81 s1 -= v[0]*x1 + v[2]*x2; 82 s2 -= v[1]*x1 + v[3]*x2; 83 v += 4; 84 } 85 x[idx] = s1; 86 x[1+idx] = s2; 87 } 88 89 /* backward solve the upper triangular */ 90 for (i=n-1; i>=0; i--){ 91 v = aa + 4*ai[2*n-i]; 92 vi = aj + ai[2*n-i]; 93 nz = ai[2*n-i +1] - ai[2*n-i]-1; 94 idt = 2*i; 95 s1 = x[idt]; s2 = x[1+idt]; 96 while (nz--) { 97 idx = 2*(*vi++); 98 x1 = x[idx]; x2 = x[1+idx]; 99 s1 -= v[0]*x1 + v[2]*x2; 100 s2 -= v[1]*x1 + v[3]*x2; 101 v += 4; 102 } 103 /* x = inv_diagonal*x */ 104 x[idt] = v[0]*s1 + v[2]*s2; 105 x[1+idt] = v[1]*s1 + v[3]*s2; 106 } 107 108 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 109 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 110 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 111 PetscFunctionReturn(0); 112 } 113 114 #undef __FUNCT__ 115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 116 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 117 { 118 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 119 PetscErrorCode ierr; 120 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 121 PetscInt *diag = a->diag,oidx; 122 MatScalar *aa=a->a,*v; 123 PetscScalar s1,s2,x1,x2; 124 PetscScalar *x,*b; 125 126 PetscFunctionBegin; 127 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 128 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 129 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 130 131 /* forward solve the U^T */ 132 idx = 0; 133 for (i=0; i<n; i++) { 134 135 v = aa + 4*diag[i]; 136 /* multiply by the inverse of the block diagonal */ 137 x1 = x[idx]; x2 = x[1+idx]; 138 s1 = v[0]*x1 + v[1]*x2; 139 s2 = v[2]*x1 + v[3]*x2; 140 v += 4; 141 142 vi = aj + diag[i] + 1; 143 nz = ai[i+1] - diag[i] - 1; 144 while (nz--) { 145 oidx = 2*(*vi++); 146 x[oidx] -= v[0]*s1 + v[1]*s2; 147 x[oidx+1] -= v[2]*s1 + v[3]*s2; 148 v += 4; 149 } 150 x[idx] = s1;x[1+idx] = s2; 151 idx += 2; 152 } 153 /* backward solve the L^T */ 154 for (i=n-1; i>=0; i--){ 155 v = aa + 4*diag[i] - 4; 156 vi = aj + diag[i] - 1; 157 nz = diag[i] - ai[i]; 158 idt = 2*i; 159 s1 = x[idt]; s2 = x[1+idt]; 160 while (nz--) { 161 idx = 2*(*vi--); 162 x[idx] -= v[0]*s1 + v[1]*s2; 163 x[idx+1] -= v[2]*s1 + v[3]*s2; 164 v -= 4; 165 } 166 } 167 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 168 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 169 ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 170 PetscFunctionReturn(0); 171 } 172 173 #undef __FUNCT__ 174 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 175 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 176 { 177 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 178 PetscErrorCode ierr; 179 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180 PetscInt *diag = a->diag,oidx; 181 MatScalar *aa=a->a,*v; 182 PetscScalar s1,s2,s3,x1,x2,x3; 183 PetscScalar *x,*b; 184 185 PetscFunctionBegin; 186 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189 190 /* forward solve the U^T */ 191 idx = 0; 192 for (i=0; i<n; i++) { 193 194 v = aa + 9*diag[i]; 195 /* multiply by the inverse of the block diagonal */ 196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 197 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 198 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 199 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 200 v += 9; 201 202 vi = aj + diag[i] + 1; 203 nz = ai[i+1] - diag[i] - 1; 204 while (nz--) { 205 oidx = 3*(*vi++); 206 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 207 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 208 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 209 v += 9; 210 } 211 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 212 idx += 3; 213 } 214 /* backward solve the L^T */ 215 for (i=n-1; i>=0; i--){ 216 v = aa + 9*diag[i] - 9; 217 vi = aj + diag[i] - 1; 218 nz = diag[i] - ai[i]; 219 idt = 3*i; 220 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 221 while (nz--) { 222 idx = 3*(*vi--); 223 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 224 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 225 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 226 v -= 9; 227 } 228 } 229 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 230 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 231 ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNCT__ 236 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 237 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 238 { 239 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 240 PetscErrorCode ierr; 241 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 242 PetscInt *diag = a->diag,oidx; 243 MatScalar *aa=a->a,*v; 244 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 245 PetscScalar *x,*b; 246 247 PetscFunctionBegin; 248 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 249 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 252 /* forward solve the U^T */ 253 idx = 0; 254 for (i=0; i<n; i++) { 255 256 v = aa + 16*diag[i]; 257 /* multiply by the inverse of the block diagonal */ 258 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 259 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 260 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 261 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 262 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 263 v += 16; 264 265 vi = aj + diag[i] + 1; 266 nz = ai[i+1] - diag[i] - 1; 267 while (nz--) { 268 oidx = 4*(*vi++); 269 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 270 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 271 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 272 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 273 v += 16; 274 } 275 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 276 idx += 4; 277 } 278 /* backward solve the L^T */ 279 for (i=n-1; i>=0; i--){ 280 v = aa + 16*diag[i] - 16; 281 vi = aj + diag[i] - 1; 282 nz = diag[i] - ai[i]; 283 idt = 4*i; 284 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 285 while (nz--) { 286 idx = 4*(*vi--); 287 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 288 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 289 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 290 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 291 v -= 16; 292 } 293 } 294 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 302 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 PetscErrorCode ierr; 306 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 PetscInt *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 310 PetscScalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 314 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 315 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 316 317 /* forward solve the U^T */ 318 idx = 0; 319 for (i=0; i<n; i++) { 320 321 v = aa + 25*diag[i]; 322 /* multiply by the inverse of the block diagonal */ 323 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 325 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 326 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 327 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 328 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 329 v += 25; 330 331 vi = aj + diag[i] + 1; 332 nz = ai[i+1] - diag[i] - 1; 333 while (nz--) { 334 oidx = 5*(*vi++); 335 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 336 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 337 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 338 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 339 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 340 v += 25; 341 } 342 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 343 idx += 5; 344 } 345 /* backward solve the L^T */ 346 for (i=n-1; i>=0; i--){ 347 v = aa + 25*diag[i] - 25; 348 vi = aj + diag[i] - 1; 349 nz = diag[i] - ai[i]; 350 idt = 5*i; 351 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 352 while (nz--) { 353 idx = 5*(*vi--); 354 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 355 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 356 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 357 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 358 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 359 v -= 25; 360 } 361 } 362 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 363 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 364 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 365 PetscFunctionReturn(0); 366 } 367 368 #undef __FUNCT__ 369 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 370 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 371 { 372 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 373 PetscErrorCode ierr; 374 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 375 PetscInt *diag = a->diag,oidx; 376 MatScalar *aa=a->a,*v; 377 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 378 PetscScalar *x,*b; 379 380 PetscFunctionBegin; 381 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 382 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 383 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 384 385 /* forward solve the U^T */ 386 idx = 0; 387 for (i=0; i<n; i++) { 388 389 v = aa + 36*diag[i]; 390 /* multiply by the inverse of the block diagonal */ 391 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 392 x6 = x[5+idx]; 393 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 394 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 395 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 396 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 397 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 398 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 399 v += 36; 400 401 vi = aj + diag[i] + 1; 402 nz = ai[i+1] - diag[i] - 1; 403 while (nz--) { 404 oidx = 6*(*vi++); 405 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 406 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 407 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 408 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 409 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 410 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 411 v += 36; 412 } 413 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 414 x[5+idx] = s6; 415 idx += 6; 416 } 417 /* backward solve the L^T */ 418 for (i=n-1; i>=0; i--){ 419 v = aa + 36*diag[i] - 36; 420 vi = aj + diag[i] - 1; 421 nz = diag[i] - ai[i]; 422 idt = 6*i; 423 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 424 s6 = x[5+idt]; 425 while (nz--) { 426 idx = 6*(*vi--); 427 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 428 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 429 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 430 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 431 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 432 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 433 v -= 36; 434 } 435 } 436 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 437 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 438 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 439 PetscFunctionReturn(0); 440 } 441 442 #undef __FUNCT__ 443 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 444 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 445 { 446 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 447 PetscErrorCode ierr; 448 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 449 PetscInt *diag = a->diag,oidx; 450 MatScalar *aa=a->a,*v; 451 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 452 PetscScalar *x,*b; 453 454 PetscFunctionBegin; 455 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 456 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 457 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 458 459 /* forward solve the U^T */ 460 idx = 0; 461 for (i=0; i<n; i++) { 462 463 v = aa + 49*diag[i]; 464 /* multiply by the inverse of the block diagonal */ 465 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 466 x6 = x[5+idx]; x7 = x[6+idx]; 467 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 468 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 469 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 470 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 471 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 472 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 473 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 474 v += 49; 475 476 vi = aj + diag[i] + 1; 477 nz = ai[i+1] - diag[i] - 1; 478 while (nz--) { 479 oidx = 7*(*vi++); 480 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 481 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 482 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 483 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 484 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 485 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 486 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 487 v += 49; 488 } 489 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 490 x[5+idx] = s6;x[6+idx] = s7; 491 idx += 7; 492 } 493 /* backward solve the L^T */ 494 for (i=n-1; i>=0; i--){ 495 v = aa + 49*diag[i] - 49; 496 vi = aj + diag[i] - 1; 497 nz = diag[i] - ai[i]; 498 idt = 7*i; 499 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 500 s6 = x[5+idt];s7 = x[6+idt]; 501 while (nz--) { 502 idx = 7*(*vi--); 503 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 504 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 505 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 506 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 507 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 508 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 509 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 510 v -= 49; 511 } 512 } 513 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 514 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 515 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 516 PetscFunctionReturn(0); 517 } 518 519 /*---------------------------------------------------------------------------------------------*/ 520 #undef __FUNCT__ 521 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 522 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 523 { 524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 525 IS iscol=a->col,isrow=a->row; 526 PetscErrorCode ierr; 527 const PetscInt *r,*c,*rout,*cout; 528 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 529 PetscInt *diag = a->diag; 530 MatScalar *aa=a->a,*v; 531 PetscScalar s1,*x,*b,*t; 532 533 PetscFunctionBegin; 534 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 536 t = a->solve_work; 537 538 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 539 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 540 541 /* copy the b into temp work space according to permutation */ 542 for (i=0; i<n; i++) { 543 t[i] = b[c[i]]; 544 } 545 546 /* forward solve the U^T */ 547 for (i=0; i<n; i++) { 548 549 v = aa + diag[i]; 550 /* multiply by the inverse of the block diagonal */ 551 s1 = (*v++)*t[i]; 552 vi = aj + diag[i] + 1; 553 nz = ai[i+1] - diag[i] - 1; 554 while (nz--) { 555 t[*vi++] -= (*v++)*s1; 556 } 557 t[i] = s1; 558 } 559 /* backward solve the L^T */ 560 for (i=n-1; i>=0; i--){ 561 v = aa + diag[i] - 1; 562 vi = aj + diag[i] - 1; 563 nz = diag[i] - ai[i]; 564 s1 = t[i]; 565 while (nz--) { 566 t[*vi--] -= (*v--)*s1; 567 } 568 } 569 570 /* copy t into x according to permutation */ 571 for (i=0; i<n; i++) { 572 x[r[i]] = t[i]; 573 } 574 575 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 576 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 577 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 578 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 579 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 580 PetscFunctionReturn(0); 581 } 582 583 #undef __FUNCT__ 584 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 585 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 586 { 587 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 588 IS iscol=a->col,isrow=a->row; 589 PetscErrorCode ierr; 590 const PetscInt *r,*c,*rout,*cout; 591 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 592 PetscInt *diag = a->diag,ii,ic,ir,oidx; 593 MatScalar *aa=a->a,*v; 594 PetscScalar s1,s2,x1,x2; 595 PetscScalar *x,*b,*t; 596 597 PetscFunctionBegin; 598 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 599 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 600 t = a->solve_work; 601 602 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 603 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 604 605 /* copy the b into temp work space according to permutation */ 606 ii = 0; 607 for (i=0; i<n; i++) { 608 ic = 2*c[i]; 609 t[ii] = b[ic]; 610 t[ii+1] = b[ic+1]; 611 ii += 2; 612 } 613 614 /* forward solve the U^T */ 615 idx = 0; 616 for (i=0; i<n; i++) { 617 618 v = aa + 4*diag[i]; 619 /* multiply by the inverse of the block diagonal */ 620 x1 = t[idx]; x2 = t[1+idx]; 621 s1 = v[0]*x1 + v[1]*x2; 622 s2 = v[2]*x1 + v[3]*x2; 623 v += 4; 624 625 vi = aj + diag[i] + 1; 626 nz = ai[i+1] - diag[i] - 1; 627 while (nz--) { 628 oidx = 2*(*vi++); 629 t[oidx] -= v[0]*s1 + v[1]*s2; 630 t[oidx+1] -= v[2]*s1 + v[3]*s2; 631 v += 4; 632 } 633 t[idx] = s1;t[1+idx] = s2; 634 idx += 2; 635 } 636 /* backward solve the L^T */ 637 for (i=n-1; i>=0; i--){ 638 v = aa + 4*diag[i] - 4; 639 vi = aj + diag[i] - 1; 640 nz = diag[i] - ai[i]; 641 idt = 2*i; 642 s1 = t[idt]; s2 = t[1+idt]; 643 while (nz--) { 644 idx = 2*(*vi--); 645 t[idx] -= v[0]*s1 + v[1]*s2; 646 t[idx+1] -= v[2]*s1 + v[3]*s2; 647 v -= 4; 648 } 649 } 650 651 /* copy t into x according to permutation */ 652 ii = 0; 653 for (i=0; i<n; i++) { 654 ir = 2*r[i]; 655 x[ir] = t[ii]; 656 x[ir+1] = t[ii+1]; 657 ii += 2; 658 } 659 660 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 661 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 662 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 663 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 664 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 665 PetscFunctionReturn(0); 666 } 667 668 #undef __FUNCT__ 669 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 670 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 671 { 672 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 673 IS iscol=a->col,isrow=a->row; 674 PetscErrorCode ierr; 675 const PetscInt *r,*c,*rout,*cout; 676 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 677 PetscInt *diag = a->diag,ii,ic,ir,oidx; 678 MatScalar *aa=a->a,*v; 679 PetscScalar s1,s2,s3,x1,x2,x3; 680 PetscScalar *x,*b,*t; 681 682 PetscFunctionBegin; 683 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 684 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 685 t = a->solve_work; 686 687 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 688 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 689 690 /* copy the b into temp work space according to permutation */ 691 ii = 0; 692 for (i=0; i<n; i++) { 693 ic = 3*c[i]; 694 t[ii] = b[ic]; 695 t[ii+1] = b[ic+1]; 696 t[ii+2] = b[ic+2]; 697 ii += 3; 698 } 699 700 /* forward solve the U^T */ 701 idx = 0; 702 for (i=0; i<n; i++) { 703 704 v = aa + 9*diag[i]; 705 /* multiply by the inverse of the block diagonal */ 706 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 707 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 708 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 709 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 710 v += 9; 711 712 vi = aj + diag[i] + 1; 713 nz = ai[i+1] - diag[i] - 1; 714 while (nz--) { 715 oidx = 3*(*vi++); 716 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 717 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 718 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 719 v += 9; 720 } 721 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 722 idx += 3; 723 } 724 /* backward solve the L^T */ 725 for (i=n-1; i>=0; i--){ 726 v = aa + 9*diag[i] - 9; 727 vi = aj + diag[i] - 1; 728 nz = diag[i] - ai[i]; 729 idt = 3*i; 730 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 731 while (nz--) { 732 idx = 3*(*vi--); 733 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 734 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 735 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 736 v -= 9; 737 } 738 } 739 740 /* copy t into x according to permutation */ 741 ii = 0; 742 for (i=0; i<n; i++) { 743 ir = 3*r[i]; 744 x[ir] = t[ii]; 745 x[ir+1] = t[ii+1]; 746 x[ir+2] = t[ii+2]; 747 ii += 3; 748 } 749 750 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 751 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 752 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 753 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 754 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 755 PetscFunctionReturn(0); 756 } 757 758 #undef __FUNCT__ 759 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 760 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 761 { 762 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 763 IS iscol=a->col,isrow=a->row; 764 PetscErrorCode ierr; 765 const PetscInt *r,*c,*rout,*cout; 766 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 767 PetscInt *diag = a->diag,ii,ic,ir,oidx; 768 MatScalar *aa=a->a,*v; 769 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 770 PetscScalar *x,*b,*t; 771 772 PetscFunctionBegin; 773 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 774 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 775 t = a->solve_work; 776 777 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 778 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 779 780 /* copy the b into temp work space according to permutation */ 781 ii = 0; 782 for (i=0; i<n; i++) { 783 ic = 4*c[i]; 784 t[ii] = b[ic]; 785 t[ii+1] = b[ic+1]; 786 t[ii+2] = b[ic+2]; 787 t[ii+3] = b[ic+3]; 788 ii += 4; 789 } 790 791 /* forward solve the U^T */ 792 idx = 0; 793 for (i=0; i<n; i++) { 794 795 v = aa + 16*diag[i]; 796 /* multiply by the inverse of the block diagonal */ 797 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 798 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 799 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 800 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 801 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 802 v += 16; 803 804 vi = aj + diag[i] + 1; 805 nz = ai[i+1] - diag[i] - 1; 806 while (nz--) { 807 oidx = 4*(*vi++); 808 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 809 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 810 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 811 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 812 v += 16; 813 } 814 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 815 idx += 4; 816 } 817 /* backward solve the L^T */ 818 for (i=n-1; i>=0; i--){ 819 v = aa + 16*diag[i] - 16; 820 vi = aj + diag[i] - 1; 821 nz = diag[i] - ai[i]; 822 idt = 4*i; 823 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 824 while (nz--) { 825 idx = 4*(*vi--); 826 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 827 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 828 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 829 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 830 v -= 16; 831 } 832 } 833 834 /* copy t into x according to permutation */ 835 ii = 0; 836 for (i=0; i<n; i++) { 837 ir = 4*r[i]; 838 x[ir] = t[ii]; 839 x[ir+1] = t[ii+1]; 840 x[ir+2] = t[ii+2]; 841 x[ir+3] = t[ii+3]; 842 ii += 4; 843 } 844 845 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 846 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 847 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 848 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 849 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 850 PetscFunctionReturn(0); 851 } 852 853 #undef __FUNCT__ 854 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 855 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 856 { 857 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 858 IS iscol=a->col,isrow=a->row; 859 PetscErrorCode ierr; 860 const PetscInt *r,*c,*rout,*cout; 861 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 862 PetscInt *diag = a->diag,ii,ic,ir,oidx; 863 MatScalar *aa=a->a,*v; 864 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 865 PetscScalar *x,*b,*t; 866 867 PetscFunctionBegin; 868 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 869 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 870 t = a->solve_work; 871 872 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 873 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 874 875 /* copy the b into temp work space according to permutation */ 876 ii = 0; 877 for (i=0; i<n; i++) { 878 ic = 5*c[i]; 879 t[ii] = b[ic]; 880 t[ii+1] = b[ic+1]; 881 t[ii+2] = b[ic+2]; 882 t[ii+3] = b[ic+3]; 883 t[ii+4] = b[ic+4]; 884 ii += 5; 885 } 886 887 /* forward solve the U^T */ 888 idx = 0; 889 for (i=0; i<n; i++) { 890 891 v = aa + 25*diag[i]; 892 /* multiply by the inverse of the block diagonal */ 893 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 894 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 895 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 896 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 897 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 898 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 899 v += 25; 900 901 vi = aj + diag[i] + 1; 902 nz = ai[i+1] - diag[i] - 1; 903 while (nz--) { 904 oidx = 5*(*vi++); 905 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 906 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 907 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 908 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 909 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 910 v += 25; 911 } 912 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 913 idx += 5; 914 } 915 /* backward solve the L^T */ 916 for (i=n-1; i>=0; i--){ 917 v = aa + 25*diag[i] - 25; 918 vi = aj + diag[i] - 1; 919 nz = diag[i] - ai[i]; 920 idt = 5*i; 921 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 922 while (nz--) { 923 idx = 5*(*vi--); 924 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 925 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 926 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 927 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 928 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 929 v -= 25; 930 } 931 } 932 933 /* copy t into x according to permutation */ 934 ii = 0; 935 for (i=0; i<n; i++) { 936 ir = 5*r[i]; 937 x[ir] = t[ii]; 938 x[ir+1] = t[ii+1]; 939 x[ir+2] = t[ii+2]; 940 x[ir+3] = t[ii+3]; 941 x[ir+4] = t[ii+4]; 942 ii += 5; 943 } 944 945 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 946 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 947 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 948 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 949 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 950 PetscFunctionReturn(0); 951 } 952 953 #undef __FUNCT__ 954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 955 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 956 { 957 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 958 IS iscol=a->col,isrow=a->row; 959 PetscErrorCode ierr; 960 const PetscInt *r,*c,*rout,*cout; 961 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 962 PetscInt *diag = a->diag,ii,ic,ir,oidx; 963 MatScalar *aa=a->a,*v; 964 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 965 PetscScalar *x,*b,*t; 966 967 PetscFunctionBegin; 968 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 969 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 970 t = a->solve_work; 971 972 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 973 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 974 975 /* copy the b into temp work space according to permutation */ 976 ii = 0; 977 for (i=0; i<n; i++) { 978 ic = 6*c[i]; 979 t[ii] = b[ic]; 980 t[ii+1] = b[ic+1]; 981 t[ii+2] = b[ic+2]; 982 t[ii+3] = b[ic+3]; 983 t[ii+4] = b[ic+4]; 984 t[ii+5] = b[ic+5]; 985 ii += 6; 986 } 987 988 /* forward solve the U^T */ 989 idx = 0; 990 for (i=0; i<n; i++) { 991 992 v = aa + 36*diag[i]; 993 /* multiply by the inverse of the block diagonal */ 994 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 995 x6 = t[5+idx]; 996 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 997 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 998 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 999 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 1000 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 1001 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 1002 v += 36; 1003 1004 vi = aj + diag[i] + 1; 1005 nz = ai[i+1] - diag[i] - 1; 1006 while (nz--) { 1007 oidx = 6*(*vi++); 1008 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1009 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1010 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1011 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1012 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1013 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1014 v += 36; 1015 } 1016 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1017 t[5+idx] = s6; 1018 idx += 6; 1019 } 1020 /* backward solve the L^T */ 1021 for (i=n-1; i>=0; i--){ 1022 v = aa + 36*diag[i] - 36; 1023 vi = aj + diag[i] - 1; 1024 nz = diag[i] - ai[i]; 1025 idt = 6*i; 1026 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1027 s6 = t[5+idt]; 1028 while (nz--) { 1029 idx = 6*(*vi--); 1030 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 1031 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 1032 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 1033 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 1034 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 1035 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 1036 v -= 36; 1037 } 1038 } 1039 1040 /* copy t into x according to permutation */ 1041 ii = 0; 1042 for (i=0; i<n; i++) { 1043 ir = 6*r[i]; 1044 x[ir] = t[ii]; 1045 x[ir+1] = t[ii+1]; 1046 x[ir+2] = t[ii+2]; 1047 x[ir+3] = t[ii+3]; 1048 x[ir+4] = t[ii+4]; 1049 x[ir+5] = t[ii+5]; 1050 ii += 6; 1051 } 1052 1053 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1054 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1055 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1056 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1057 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1058 PetscFunctionReturn(0); 1059 } 1060 1061 #undef __FUNCT__ 1062 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 1063 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1064 { 1065 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1066 IS iscol=a->col,isrow=a->row; 1067 PetscErrorCode ierr; 1068 const PetscInt *r,*c,*rout,*cout; 1069 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1070 PetscInt *diag = a->diag,ii,ic,ir,oidx; 1071 MatScalar *aa=a->a,*v; 1072 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1073 PetscScalar *x,*b,*t; 1074 1075 PetscFunctionBegin; 1076 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1077 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1078 t = a->solve_work; 1079 1080 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1081 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1082 1083 /* copy the b into temp work space according to permutation */ 1084 ii = 0; 1085 for (i=0; i<n; i++) { 1086 ic = 7*c[i]; 1087 t[ii] = b[ic]; 1088 t[ii+1] = b[ic+1]; 1089 t[ii+2] = b[ic+2]; 1090 t[ii+3] = b[ic+3]; 1091 t[ii+4] = b[ic+4]; 1092 t[ii+5] = b[ic+5]; 1093 t[ii+6] = b[ic+6]; 1094 ii += 7; 1095 } 1096 1097 /* forward solve the U^T */ 1098 idx = 0; 1099 for (i=0; i<n; i++) { 1100 1101 v = aa + 49*diag[i]; 1102 /* multiply by the inverse of the block diagonal */ 1103 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1104 x6 = t[5+idx]; x7 = t[6+idx]; 1105 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1106 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1107 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1108 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1109 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1110 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1111 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1112 v += 49; 1113 1114 vi = aj + diag[i] + 1; 1115 nz = ai[i+1] - diag[i] - 1; 1116 while (nz--) { 1117 oidx = 7*(*vi++); 1118 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1119 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1120 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1121 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1122 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1123 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1124 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1125 v += 49; 1126 } 1127 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1128 t[5+idx] = s6;t[6+idx] = s7; 1129 idx += 7; 1130 } 1131 /* backward solve the L^T */ 1132 for (i=n-1; i>=0; i--){ 1133 v = aa + 49*diag[i] - 49; 1134 vi = aj + diag[i] - 1; 1135 nz = diag[i] - ai[i]; 1136 idt = 7*i; 1137 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1138 s6 = t[5+idt];s7 = t[6+idt]; 1139 while (nz--) { 1140 idx = 7*(*vi--); 1141 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1142 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1143 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1144 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1145 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1146 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1147 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1148 v -= 49; 1149 } 1150 } 1151 1152 /* copy t into x according to permutation */ 1153 ii = 0; 1154 for (i=0; i<n; i++) { 1155 ir = 7*r[i]; 1156 x[ir] = t[ii]; 1157 x[ir+1] = t[ii+1]; 1158 x[ir+2] = t[ii+2]; 1159 x[ir+3] = t[ii+3]; 1160 x[ir+4] = t[ii+4]; 1161 x[ir+5] = t[ii+5]; 1162 x[ir+6] = t[ii+6]; 1163 ii += 7; 1164 } 1165 1166 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1167 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1170 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1171 PetscFunctionReturn(0); 1172 } 1173 1174 /* ----------------------------------------------------------- */ 1175 #undef __FUNCT__ 1176 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1177 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1178 { 1179 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1180 IS iscol=a->col,isrow=a->row; 1181 PetscErrorCode ierr; 1182 const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi; 1183 PetscInt i,n=a->mbs; 1184 PetscInt nz,bs=A->rmap->bs,bs2=a->bs2; 1185 MatScalar *aa=a->a,*v; 1186 PetscScalar *x,*b,*s,*t,*ls; 1187 1188 PetscFunctionBegin; 1189 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1190 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1191 t = a->solve_work; 1192 1193 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1194 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1195 1196 /* forward solve the lower triangular */ 1197 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1198 for (i=1; i<n; i++) { 1199 v = aa + bs2*ai[i]; 1200 vi = aj + ai[i]; 1201 nz = a->diag[i] - ai[i]; 1202 s = t + bs*i; 1203 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1204 while (nz--) { 1205 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1206 v += bs2; 1207 } 1208 } 1209 /* backward solve the upper triangular */ 1210 ls = a->solve_work + A->cmap->n; 1211 for (i=n-1; i>=0; i--){ 1212 v = aa + bs2*(a->diag[i] + 1); 1213 vi = aj + a->diag[i] + 1; 1214 nz = ai[i+1] - a->diag[i] - 1; 1215 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1216 while (nz--) { 1217 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1218 v += bs2; 1219 } 1220 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1221 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1222 } 1223 1224 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1225 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1226 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1227 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1228 ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr); 1229 PetscFunctionReturn(0); 1230 } 1231 1232 #undef __FUNCT__ 1233 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1234 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1235 { 1236 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1237 IS iscol=a->col,isrow=a->row; 1238 PetscErrorCode ierr; 1239 const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi; 1240 PetscInt i,n=a->mbs,nz,idx,idt,idc; 1241 MatScalar *aa=a->a,*v; 1242 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1243 PetscScalar *x,*b,*t; 1244 1245 PetscFunctionBegin; 1246 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1247 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1248 t = a->solve_work; 1249 1250 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1251 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1252 1253 /* forward solve the lower triangular */ 1254 idx = 7*(*r++); 1255 t[0] = b[idx]; t[1] = b[1+idx]; 1256 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1257 t[5] = b[5+idx]; t[6] = b[6+idx]; 1258 1259 for (i=1; i<n; i++) { 1260 v = aa + 49*ai[i]; 1261 vi = aj + ai[i]; 1262 nz = diag[i] - ai[i]; 1263 idx = 7*(*r++); 1264 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1265 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1266 while (nz--) { 1267 idx = 7*(*vi++); 1268 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1269 x4 = t[3+idx];x5 = t[4+idx]; 1270 x6 = t[5+idx];x7 = t[6+idx]; 1271 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1272 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1273 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1274 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1275 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1276 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1277 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1278 v += 49; 1279 } 1280 idx = 7*i; 1281 t[idx] = s1;t[1+idx] = s2; 1282 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1283 t[5+idx] = s6;t[6+idx] = s7; 1284 } 1285 /* backward solve the upper triangular */ 1286 for (i=n-1; i>=0; i--){ 1287 v = aa + 49*diag[i] + 49; 1288 vi = aj + diag[i] + 1; 1289 nz = ai[i+1] - diag[i] - 1; 1290 idt = 7*i; 1291 s1 = t[idt]; s2 = t[1+idt]; 1292 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1293 s6 = t[5+idt];s7 = t[6+idt]; 1294 while (nz--) { 1295 idx = 7*(*vi++); 1296 x1 = t[idx]; x2 = t[1+idx]; 1297 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1298 x6 = t[5+idx]; x7 = t[6+idx]; 1299 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1300 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1301 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1302 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1303 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1304 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1305 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1306 v += 49; 1307 } 1308 idc = 7*(*c--); 1309 v = aa + 49*diag[i]; 1310 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1311 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1312 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1313 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1314 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1315 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1316 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1317 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1318 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1319 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1320 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1321 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1322 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1323 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1324 } 1325 1326 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1327 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1328 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1329 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1330 ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr); 1331 PetscFunctionReturn(0); 1332 } 1333 1334 #undef __FUNCT__ 1335 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1336 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1337 { 1338 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1339 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1340 PetscErrorCode ierr; 1341 PetscInt *diag = a->diag,jdx; 1342 const MatScalar *aa=a->a,*v; 1343 PetscScalar *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1344 const PetscScalar *b; 1345 1346 PetscFunctionBegin; 1347 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1348 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1349 /* forward solve the lower triangular */ 1350 idx = 0; 1351 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1352 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1353 x[6] = b[6+idx]; 1354 for (i=1; i<n; i++) { 1355 v = aa + 49*ai[i]; 1356 vi = aj + ai[i]; 1357 nz = diag[i] - ai[i]; 1358 idx = 7*i; 1359 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1360 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1361 s7 = b[6+idx]; 1362 while (nz--) { 1363 jdx = 7*(*vi++); 1364 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1365 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1366 x7 = x[6+jdx]; 1367 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1368 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1369 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1370 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1371 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1372 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1373 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1374 v += 49; 1375 } 1376 x[idx] = s1; 1377 x[1+idx] = s2; 1378 x[2+idx] = s3; 1379 x[3+idx] = s4; 1380 x[4+idx] = s5; 1381 x[5+idx] = s6; 1382 x[6+idx] = s7; 1383 } 1384 /* backward solve the upper triangular */ 1385 for (i=n-1; i>=0; i--){ 1386 v = aa + 49*diag[i] + 49; 1387 vi = aj + diag[i] + 1; 1388 nz = ai[i+1] - diag[i] - 1; 1389 idt = 7*i; 1390 s1 = x[idt]; s2 = x[1+idt]; 1391 s3 = x[2+idt]; s4 = x[3+idt]; 1392 s5 = x[4+idt]; s6 = x[5+idt]; 1393 s7 = x[6+idt]; 1394 while (nz--) { 1395 idx = 7*(*vi++); 1396 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1397 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1398 x7 = x[6+idx]; 1399 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1400 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1401 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1402 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1403 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1404 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1405 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1406 v += 49; 1407 } 1408 v = aa + 49*diag[i]; 1409 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1410 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1411 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1412 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1413 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1414 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1415 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1416 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1417 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1418 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1419 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1420 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1421 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1422 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1423 } 1424 1425 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1426 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1427 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1428 PetscFunctionReturn(0); 1429 } 1430 1431 #undef __FUNCT__ 1432 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1433 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1434 { 1435 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1436 IS iscol=a->col,isrow=a->row; 1437 PetscErrorCode ierr; 1438 const PetscInt *r,*c,*rout,*cout; 1439 PetscInt *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1440 const MatScalar *aa=a->a,*v; 1441 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1442 const PetscScalar *b; 1443 PetscFunctionBegin; 1444 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1445 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1446 t = a->solve_work; 1447 1448 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1449 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1450 1451 /* forward solve the lower triangular */ 1452 idx = 6*(*r++); 1453 t[0] = b[idx]; t[1] = b[1+idx]; 1454 t[2] = b[2+idx]; t[3] = b[3+idx]; 1455 t[4] = b[4+idx]; t[5] = b[5+idx]; 1456 for (i=1; i<n; i++) { 1457 v = aa + 36*ai[i]; 1458 vi = aj + ai[i]; 1459 nz = diag[i] - ai[i]; 1460 idx = 6*(*r++); 1461 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1462 s5 = b[4+idx]; s6 = b[5+idx]; 1463 while (nz--) { 1464 idx = 6*(*vi++); 1465 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1466 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1467 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1468 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1469 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1470 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1471 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1472 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1473 v += 36; 1474 } 1475 idx = 6*i; 1476 t[idx] = s1;t[1+idx] = s2; 1477 t[2+idx] = s3;t[3+idx] = s4; 1478 t[4+idx] = s5;t[5+idx] = s6; 1479 } 1480 /* backward solve the upper triangular */ 1481 for (i=n-1; i>=0; i--){ 1482 v = aa + 36*diag[i] + 36; 1483 vi = aj + diag[i] + 1; 1484 nz = ai[i+1] - diag[i] - 1; 1485 idt = 6*i; 1486 s1 = t[idt]; s2 = t[1+idt]; 1487 s3 = t[2+idt];s4 = t[3+idt]; 1488 s5 = t[4+idt];s6 = t[5+idt]; 1489 while (nz--) { 1490 idx = 6*(*vi++); 1491 x1 = t[idx]; x2 = t[1+idx]; 1492 x3 = t[2+idx]; x4 = t[3+idx]; 1493 x5 = t[4+idx]; x6 = t[5+idx]; 1494 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1495 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1496 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1497 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1498 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1499 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1500 v += 36; 1501 } 1502 idc = 6*(*c--); 1503 v = aa + 36*diag[i]; 1504 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1505 v[18]*s4+v[24]*s5+v[30]*s6; 1506 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1507 v[19]*s4+v[25]*s5+v[31]*s6; 1508 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1509 v[20]*s4+v[26]*s5+v[32]*s6; 1510 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1511 v[21]*s4+v[27]*s5+v[33]*s6; 1512 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1513 v[22]*s4+v[28]*s5+v[34]*s6; 1514 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1515 v[23]*s4+v[29]*s5+v[35]*s6; 1516 } 1517 1518 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1519 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1520 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1521 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1522 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1523 PetscFunctionReturn(0); 1524 } 1525 1526 #undef __FUNCT__ 1527 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1528 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1529 { 1530 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1531 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1532 PetscErrorCode ierr; 1533 PetscInt *diag = a->diag,jdx; 1534 const MatScalar *aa=a->a,*v; 1535 PetscScalar *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1536 const PetscScalar *b; 1537 1538 PetscFunctionBegin; 1539 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1540 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1541 /* forward solve the lower triangular */ 1542 idx = 0; 1543 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1544 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1545 for (i=1; i<n; i++) { 1546 v = aa + 36*ai[i]; 1547 vi = aj + ai[i]; 1548 nz = diag[i] - ai[i]; 1549 idx = 6*i; 1550 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1551 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1552 while (nz--) { 1553 jdx = 6*(*vi++); 1554 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1555 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1556 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1557 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1558 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1559 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1560 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1561 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1562 v += 36; 1563 } 1564 x[idx] = s1; 1565 x[1+idx] = s2; 1566 x[2+idx] = s3; 1567 x[3+idx] = s4; 1568 x[4+idx] = s5; 1569 x[5+idx] = s6; 1570 } 1571 /* backward solve the upper triangular */ 1572 for (i=n-1; i>=0; i--){ 1573 v = aa + 36*diag[i] + 36; 1574 vi = aj + diag[i] + 1; 1575 nz = ai[i+1] - diag[i] - 1; 1576 idt = 6*i; 1577 s1 = x[idt]; s2 = x[1+idt]; 1578 s3 = x[2+idt]; s4 = x[3+idt]; 1579 s5 = x[4+idt]; s6 = x[5+idt]; 1580 while (nz--) { 1581 idx = 6*(*vi++); 1582 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1583 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1584 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1585 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1586 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1587 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1588 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1589 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1590 v += 36; 1591 } 1592 v = aa + 36*diag[i]; 1593 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1594 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1595 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1596 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1597 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1598 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1599 } 1600 1601 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1602 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1603 ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr); 1604 PetscFunctionReturn(0); 1605 } 1606 1607 #undef __FUNCT__ 1608 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1609 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1610 { 1611 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1612 IS iscol=a->col,isrow=a->row; 1613 PetscErrorCode ierr; 1614 const PetscInt *r,*c,*rout,*cout,*diag = a->diag; 1615 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1616 const MatScalar *aa=a->a,*v; 1617 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1618 const PetscScalar *b; 1619 1620 PetscFunctionBegin; 1621 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1622 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1623 t = a->solve_work; 1624 1625 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1626 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1627 1628 /* forward solve the lower triangular */ 1629 idx = 5*(*r++); 1630 t[0] = b[idx]; t[1] = b[1+idx]; 1631 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1632 for (i=1; i<n; i++) { 1633 v = aa + 25*ai[i]; 1634 vi = aj + ai[i]; 1635 nz = diag[i] - ai[i]; 1636 idx = 5*(*r++); 1637 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1638 s5 = b[4+idx]; 1639 while (nz--) { 1640 idx = 5*(*vi++); 1641 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1642 x4 = t[3+idx];x5 = t[4+idx]; 1643 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1644 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1645 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1646 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1647 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1648 v += 25; 1649 } 1650 idx = 5*i; 1651 t[idx] = s1;t[1+idx] = s2; 1652 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1653 } 1654 /* backward solve the upper triangular */ 1655 for (i=n-1; i>=0; i--){ 1656 v = aa + 25*diag[i] + 25; 1657 vi = aj + diag[i] + 1; 1658 nz = ai[i+1] - diag[i] - 1; 1659 idt = 5*i; 1660 s1 = t[idt]; s2 = t[1+idt]; 1661 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1662 while (nz--) { 1663 idx = 5*(*vi++); 1664 x1 = t[idx]; x2 = t[1+idx]; 1665 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1666 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1667 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1668 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1669 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1670 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1671 v += 25; 1672 } 1673 idc = 5*(*c--); 1674 v = aa + 25*diag[i]; 1675 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1676 v[15]*s4+v[20]*s5; 1677 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1678 v[16]*s4+v[21]*s5; 1679 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1680 v[17]*s4+v[22]*s5; 1681 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1682 v[18]*s4+v[23]*s5; 1683 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1684 v[19]*s4+v[24]*s5; 1685 } 1686 1687 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1688 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1689 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1690 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1691 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1692 PetscFunctionReturn(0); 1693 } 1694 1695 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx) 1696 { 1697 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1698 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1699 PetscErrorCode ierr; 1700 PetscInt jdx; 1701 const MatScalar *aa=a->a,*v; 1702 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1703 const PetscScalar *b; 1704 1705 PetscFunctionBegin; 1706 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1707 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1708 /* forward solve the lower triangular */ 1709 idx = 0; 1710 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1711 for (i=1; i<n; i++) { 1712 v = aa + 25*ai[i]; 1713 vi = aj + ai[i]; 1714 nz = ai[i+1] - ai[i]; 1715 idx = 5*i; 1716 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1717 while (nz--) { 1718 jdx = 5*(*vi++); 1719 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1720 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1721 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1722 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1723 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1724 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1725 v += 25; 1726 } 1727 x[idx] = s1; 1728 x[1+idx] = s2; 1729 x[2+idx] = s3; 1730 x[3+idx] = s4; 1731 x[4+idx] = s5; 1732 } 1733 1734 /* backward solve the upper triangular */ 1735 for (i=n-1; i>=0; i--){ 1736 v = aa + 25*ai[2*n-i]; 1737 vi = aj + ai[2*n-i]; 1738 nz = ai[2*n-i +1] - ai[2*n-i]-1; 1739 idt = 5*i; 1740 s1 = x[idt]; s2 = x[1+idt]; 1741 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1742 while (nz--) { 1743 idx = 5*(*vi++); 1744 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1745 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1746 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1747 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1748 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1749 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1750 v += 25; 1751 } 1752 /* x = inv_diagonal*x */ 1753 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1754 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1755 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1756 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1757 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1758 } 1759 1760 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1761 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1762 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1763 PetscFunctionReturn(0); 1764 } 1765 1766 #undef __FUNCT__ 1767 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1768 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1769 { 1770 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1771 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1772 PetscErrorCode ierr; 1773 PetscInt *diag = a->diag,jdx; 1774 const MatScalar *aa=a->a,*v; 1775 PetscScalar *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1776 const PetscScalar *b; 1777 1778 PetscFunctionBegin; 1779 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1780 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1781 /* forward solve the lower triangular */ 1782 idx = 0; 1783 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1784 for (i=1; i<n; i++) { 1785 v = aa + 25*ai[i]; 1786 vi = aj + ai[i]; 1787 nz = diag[i] - ai[i]; 1788 idx = 5*i; 1789 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1790 while (nz--) { 1791 jdx = 5*(*vi++); 1792 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1793 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1794 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1795 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1796 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1797 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1798 v += 25; 1799 } 1800 x[idx] = s1; 1801 x[1+idx] = s2; 1802 x[2+idx] = s3; 1803 x[3+idx] = s4; 1804 x[4+idx] = s5; 1805 } 1806 /* backward solve the upper triangular */ 1807 for (i=n-1; i>=0; i--){ 1808 v = aa + 25*diag[i] + 25; 1809 vi = aj + diag[i] + 1; 1810 nz = ai[i+1] - diag[i] - 1; 1811 idt = 5*i; 1812 s1 = x[idt]; s2 = x[1+idt]; 1813 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1814 while (nz--) { 1815 idx = 5*(*vi++); 1816 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1817 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1818 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1819 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1820 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1821 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1822 v += 25; 1823 } 1824 v = aa + 25*diag[i]; 1825 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1826 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1827 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1828 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1829 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1830 } 1831 1832 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1833 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1834 ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr); 1835 PetscFunctionReturn(0); 1836 } 1837 1838 #undef __FUNCT__ 1839 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1840 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1841 { 1842 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1843 IS iscol=a->col,isrow=a->row; 1844 PetscErrorCode ierr; 1845 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1846 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1847 const MatScalar *aa=a->a,*v; 1848 PetscScalar *x,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1849 const PetscScalar *b; 1850 1851 PetscFunctionBegin; 1852 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1854 t = a->solve_work; 1855 1856 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1857 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1858 1859 /* forward solve the lower triangular */ 1860 idx = 4*(*r++); 1861 t[0] = b[idx]; t[1] = b[1+idx]; 1862 t[2] = b[2+idx]; t[3] = b[3+idx]; 1863 for (i=1; i<n; i++) { 1864 v = aa + 16*ai[i]; 1865 vi = aj + ai[i]; 1866 nz = diag[i] - ai[i]; 1867 idx = 4*(*r++); 1868 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1869 while (nz--) { 1870 idx = 4*(*vi++); 1871 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1872 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1873 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1874 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1875 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1876 v += 16; 1877 } 1878 idx = 4*i; 1879 t[idx] = s1;t[1+idx] = s2; 1880 t[2+idx] = s3;t[3+idx] = s4; 1881 } 1882 /* backward solve the upper triangular */ 1883 for (i=n-1; i>=0; i--){ 1884 v = aa + 16*diag[i] + 16; 1885 vi = aj + diag[i] + 1; 1886 nz = ai[i+1] - diag[i] - 1; 1887 idt = 4*i; 1888 s1 = t[idt]; s2 = t[1+idt]; 1889 s3 = t[2+idt];s4 = t[3+idt]; 1890 while (nz--) { 1891 idx = 4*(*vi++); 1892 x1 = t[idx]; x2 = t[1+idx]; 1893 x3 = t[2+idx]; x4 = t[3+idx]; 1894 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1895 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1896 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1897 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1898 v += 16; 1899 } 1900 idc = 4*(*c--); 1901 v = aa + 16*diag[i]; 1902 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1903 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1904 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1905 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1906 } 1907 1908 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1909 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1910 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1911 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1912 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 1913 PetscFunctionReturn(0); 1914 } 1915 1916 #undef __FUNCT__ 1917 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1918 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1919 { 1920 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1921 IS iscol=a->col,isrow=a->row; 1922 PetscErrorCode ierr; 1923 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 1924 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 1925 const MatScalar *aa=a->a,*v; 1926 MatScalar s1,s2,s3,s4,x1,x2,x3,x4,*t; 1927 PetscScalar *x; 1928 const PetscScalar *b; 1929 1930 PetscFunctionBegin; 1931 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 1932 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1933 t = (MatScalar *)a->solve_work; 1934 1935 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1936 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1937 1938 /* forward solve the lower triangular */ 1939 idx = 4*(*r++); 1940 t[0] = (MatScalar)b[idx]; 1941 t[1] = (MatScalar)b[1+idx]; 1942 t[2] = (MatScalar)b[2+idx]; 1943 t[3] = (MatScalar)b[3+idx]; 1944 for (i=1; i<n; i++) { 1945 v = aa + 16*ai[i]; 1946 vi = aj + ai[i]; 1947 nz = diag[i] - ai[i]; 1948 idx = 4*(*r++); 1949 s1 = (MatScalar)b[idx]; 1950 s2 = (MatScalar)b[1+idx]; 1951 s3 = (MatScalar)b[2+idx]; 1952 s4 = (MatScalar)b[3+idx]; 1953 while (nz--) { 1954 idx = 4*(*vi++); 1955 x1 = t[idx]; 1956 x2 = t[1+idx]; 1957 x3 = t[2+idx]; 1958 x4 = t[3+idx]; 1959 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1960 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1961 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1962 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1963 v += 16; 1964 } 1965 idx = 4*i; 1966 t[idx] = s1; 1967 t[1+idx] = s2; 1968 t[2+idx] = s3; 1969 t[3+idx] = s4; 1970 } 1971 /* backward solve the upper triangular */ 1972 for (i=n-1; i>=0; i--){ 1973 v = aa + 16*diag[i] + 16; 1974 vi = aj + diag[i] + 1; 1975 nz = ai[i+1] - diag[i] - 1; 1976 idt = 4*i; 1977 s1 = t[idt]; 1978 s2 = t[1+idt]; 1979 s3 = t[2+idt]; 1980 s4 = t[3+idt]; 1981 while (nz--) { 1982 idx = 4*(*vi++); 1983 x1 = t[idx]; 1984 x2 = t[1+idx]; 1985 x3 = t[2+idx]; 1986 x4 = t[3+idx]; 1987 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1988 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1989 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1990 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1991 v += 16; 1992 } 1993 idc = 4*(*c--); 1994 v = aa + 16*diag[i]; 1995 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1996 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1997 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1998 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1999 x[idc] = (PetscScalar)t[idt]; 2000 x[1+idc] = (PetscScalar)t[1+idt]; 2001 x[2+idc] = (PetscScalar)t[2+idt]; 2002 x[3+idc] = (PetscScalar)t[3+idt]; 2003 } 2004 2005 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2006 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2007 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2008 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2009 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2010 PetscFunctionReturn(0); 2011 } 2012 2013 #if defined (PETSC_HAVE_SSE) 2014 2015 #include PETSC_HAVE_SSE 2016 2017 #undef __FUNCT__ 2018 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 2019 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 2020 { 2021 /* 2022 Note: This code uses demotion of double 2023 to float when performing the mixed-mode computation. 2024 This may not be numerically reasonable for all applications. 2025 */ 2026 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2027 IS iscol=a->col,isrow=a->row; 2028 PetscErrorCode ierr; 2029 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16; 2030 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2031 MatScalar *aa=a->a,*v; 2032 PetscScalar *x,*b,*t; 2033 2034 /* Make space in temp stack for 16 Byte Aligned arrays */ 2035 float ssealignedspace[11],*tmps,*tmpx; 2036 unsigned long offset; 2037 2038 PetscFunctionBegin; 2039 SSE_SCOPE_BEGIN; 2040 2041 offset = (unsigned long)ssealignedspace % 16; 2042 if (offset) offset = (16 - offset)/4; 2043 tmps = &ssealignedspace[offset]; 2044 tmpx = &ssealignedspace[offset+4]; 2045 PREFETCH_NTA(aa+16*ai[1]); 2046 2047 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2048 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2049 t = a->solve_work; 2050 2051 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2052 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2053 2054 /* forward solve the lower triangular */ 2055 idx = 4*(*r++); 2056 t[0] = b[idx]; t[1] = b[1+idx]; 2057 t[2] = b[2+idx]; t[3] = b[3+idx]; 2058 v = aa + 16*ai[1]; 2059 2060 for (i=1; i<n;) { 2061 PREFETCH_NTA(&v[8]); 2062 vi = aj + ai[i]; 2063 nz = diag[i] - ai[i]; 2064 idx = 4*(*r++); 2065 2066 /* Demote sum from double to float */ 2067 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2068 LOAD_PS(tmps,XMM7); 2069 2070 while (nz--) { 2071 PREFETCH_NTA(&v[16]); 2072 idx = 4*(*vi++); 2073 2074 /* Demote solution (so far) from double to float */ 2075 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2076 2077 /* 4x4 Matrix-Vector product with negative accumulation: */ 2078 SSE_INLINE_BEGIN_2(tmpx,v) 2079 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2080 2081 /* First Column */ 2082 SSE_COPY_PS(XMM0,XMM6) 2083 SSE_SHUFFLE(XMM0,XMM0,0x00) 2084 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2085 SSE_SUB_PS(XMM7,XMM0) 2086 2087 /* Second Column */ 2088 SSE_COPY_PS(XMM1,XMM6) 2089 SSE_SHUFFLE(XMM1,XMM1,0x55) 2090 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2091 SSE_SUB_PS(XMM7,XMM1) 2092 2093 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2094 2095 /* Third Column */ 2096 SSE_COPY_PS(XMM2,XMM6) 2097 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2098 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2099 SSE_SUB_PS(XMM7,XMM2) 2100 2101 /* Fourth Column */ 2102 SSE_COPY_PS(XMM3,XMM6) 2103 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2104 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2105 SSE_SUB_PS(XMM7,XMM3) 2106 SSE_INLINE_END_2 2107 2108 v += 16; 2109 } 2110 idx = 4*i; 2111 v = aa + 16*ai[++i]; 2112 PREFETCH_NTA(v); 2113 STORE_PS(tmps,XMM7); 2114 2115 /* Promote result from float to double */ 2116 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 2117 } 2118 /* backward solve the upper triangular */ 2119 idt = 4*(n-1); 2120 ai16 = 16*diag[n-1]; 2121 v = aa + ai16 + 16; 2122 for (i=n-1; i>=0;){ 2123 PREFETCH_NTA(&v[8]); 2124 vi = aj + diag[i] + 1; 2125 nz = ai[i+1] - diag[i] - 1; 2126 2127 /* Demote accumulator from double to float */ 2128 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 2129 LOAD_PS(tmps,XMM7); 2130 2131 while (nz--) { 2132 PREFETCH_NTA(&v[16]); 2133 idx = 4*(*vi++); 2134 2135 /* Demote solution (so far) from double to float */ 2136 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 2137 2138 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2139 SSE_INLINE_BEGIN_2(tmpx,v) 2140 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2141 2142 /* First Column */ 2143 SSE_COPY_PS(XMM0,XMM6) 2144 SSE_SHUFFLE(XMM0,XMM0,0x00) 2145 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2146 SSE_SUB_PS(XMM7,XMM0) 2147 2148 /* Second Column */ 2149 SSE_COPY_PS(XMM1,XMM6) 2150 SSE_SHUFFLE(XMM1,XMM1,0x55) 2151 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2152 SSE_SUB_PS(XMM7,XMM1) 2153 2154 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2155 2156 /* Third Column */ 2157 SSE_COPY_PS(XMM2,XMM6) 2158 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2159 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2160 SSE_SUB_PS(XMM7,XMM2) 2161 2162 /* Fourth Column */ 2163 SSE_COPY_PS(XMM3,XMM6) 2164 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2165 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2166 SSE_SUB_PS(XMM7,XMM3) 2167 SSE_INLINE_END_2 2168 v += 16; 2169 } 2170 v = aa + ai16; 2171 ai16 = 16*diag[--i]; 2172 PREFETCH_NTA(aa+ai16+16); 2173 /* 2174 Scale the result by the diagonal 4x4 block, 2175 which was inverted as part of the factorization 2176 */ 2177 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2178 /* First Column */ 2179 SSE_COPY_PS(XMM0,XMM7) 2180 SSE_SHUFFLE(XMM0,XMM0,0x00) 2181 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2182 2183 /* Second Column */ 2184 SSE_COPY_PS(XMM1,XMM7) 2185 SSE_SHUFFLE(XMM1,XMM1,0x55) 2186 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2187 SSE_ADD_PS(XMM0,XMM1) 2188 2189 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2190 2191 /* Third Column */ 2192 SSE_COPY_PS(XMM2,XMM7) 2193 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2194 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2195 SSE_ADD_PS(XMM0,XMM2) 2196 2197 /* Fourth Column */ 2198 SSE_COPY_PS(XMM3,XMM7) 2199 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2200 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2201 SSE_ADD_PS(XMM0,XMM3) 2202 2203 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2204 SSE_INLINE_END_3 2205 2206 /* Promote solution from float to double */ 2207 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2208 2209 /* Apply reordering to t and stream into x. */ 2210 /* This way, x doesn't pollute the cache. */ 2211 /* Be careful with size: 2 doubles = 4 floats! */ 2212 idc = 4*(*c--); 2213 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2214 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2215 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2216 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2217 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2218 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2219 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2220 SSE_INLINE_END_2 2221 v = aa + ai16 + 16; 2222 idt -= 4; 2223 } 2224 2225 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2226 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2227 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2228 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2229 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2230 SSE_SCOPE_END; 2231 PetscFunctionReturn(0); 2232 } 2233 2234 #endif 2235 2236 2237 /* 2238 Special case where the matrix was ILU(0) factored in the natural 2239 ordering. This eliminates the need for the column and row permutation. 2240 */ 2241 #undef __FUNCT__ 2242 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2243 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2244 { 2245 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2246 PetscInt n=a->mbs; 2247 const PetscInt *ai=a->i,*aj=a->j; 2248 PetscErrorCode ierr; 2249 const PetscInt *diag = a->diag; 2250 const MatScalar *aa=a->a; 2251 PetscScalar *x; 2252 const PetscScalar *b; 2253 2254 PetscFunctionBegin; 2255 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2256 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2257 2258 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2259 { 2260 static PetscScalar w[2000]; /* very BAD need to fix */ 2261 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2262 } 2263 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2264 { 2265 static PetscScalar w[2000]; /* very BAD need to fix */ 2266 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2267 } 2268 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2269 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2270 #else 2271 { 2272 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2273 const MatScalar *v; 2274 PetscInt jdx,idt,idx,nz,i,ai16; 2275 const PetscInt *vi; 2276 2277 /* forward solve the lower triangular */ 2278 idx = 0; 2279 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2280 for (i=1; i<n; i++) { 2281 v = aa + 16*ai[i]; 2282 vi = aj + ai[i]; 2283 nz = diag[i] - ai[i]; 2284 idx += 4; 2285 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2286 while (nz--) { 2287 jdx = 4*(*vi++); 2288 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2289 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2290 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2291 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2292 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2293 v += 16; 2294 } 2295 x[idx] = s1; 2296 x[1+idx] = s2; 2297 x[2+idx] = s3; 2298 x[3+idx] = s4; 2299 } 2300 /* backward solve the upper triangular */ 2301 idt = 4*(n-1); 2302 for (i=n-1; i>=0; i--){ 2303 ai16 = 16*diag[i]; 2304 v = aa + ai16 + 16; 2305 vi = aj + diag[i] + 1; 2306 nz = ai[i+1] - diag[i] - 1; 2307 s1 = x[idt]; s2 = x[1+idt]; 2308 s3 = x[2+idt];s4 = x[3+idt]; 2309 while (nz--) { 2310 idx = 4*(*vi++); 2311 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2312 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2313 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2314 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2315 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2316 v += 16; 2317 } 2318 v = aa + ai16; 2319 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2320 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2321 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2322 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2323 idt -= 4; 2324 } 2325 } 2326 #endif 2327 2328 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2329 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2330 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2331 PetscFunctionReturn(0); 2332 } 2333 2334 #undef __FUNCT__ 2335 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2336 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2337 { 2338 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2339 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2340 PetscErrorCode ierr; 2341 PetscInt *diag = a->diag; 2342 MatScalar *aa=a->a; 2343 PetscScalar *x,*b; 2344 2345 PetscFunctionBegin; 2346 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2347 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2348 2349 { 2350 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2351 MatScalar *v,*t=(MatScalar *)x; 2352 PetscInt jdx,idt,idx,nz,*vi,i,ai16; 2353 2354 /* forward solve the lower triangular */ 2355 idx = 0; 2356 t[0] = (MatScalar)b[0]; 2357 t[1] = (MatScalar)b[1]; 2358 t[2] = (MatScalar)b[2]; 2359 t[3] = (MatScalar)b[3]; 2360 for (i=1; i<n; i++) { 2361 v = aa + 16*ai[i]; 2362 vi = aj + ai[i]; 2363 nz = diag[i] - ai[i]; 2364 idx += 4; 2365 s1 = (MatScalar)b[idx]; 2366 s2 = (MatScalar)b[1+idx]; 2367 s3 = (MatScalar)b[2+idx]; 2368 s4 = (MatScalar)b[3+idx]; 2369 while (nz--) { 2370 jdx = 4*(*vi++); 2371 x1 = t[jdx]; 2372 x2 = t[1+jdx]; 2373 x3 = t[2+jdx]; 2374 x4 = t[3+jdx]; 2375 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2376 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2377 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2378 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2379 v += 16; 2380 } 2381 t[idx] = s1; 2382 t[1+idx] = s2; 2383 t[2+idx] = s3; 2384 t[3+idx] = s4; 2385 } 2386 /* backward solve the upper triangular */ 2387 idt = 4*(n-1); 2388 for (i=n-1; i>=0; i--){ 2389 ai16 = 16*diag[i]; 2390 v = aa + ai16 + 16; 2391 vi = aj + diag[i] + 1; 2392 nz = ai[i+1] - diag[i] - 1; 2393 s1 = t[idt]; 2394 s2 = t[1+idt]; 2395 s3 = t[2+idt]; 2396 s4 = t[3+idt]; 2397 while (nz--) { 2398 idx = 4*(*vi++); 2399 x1 = (MatScalar)x[idx]; 2400 x2 = (MatScalar)x[1+idx]; 2401 x3 = (MatScalar)x[2+idx]; 2402 x4 = (MatScalar)x[3+idx]; 2403 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2404 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2405 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2406 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2407 v += 16; 2408 } 2409 v = aa + ai16; 2410 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2411 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2412 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2413 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2414 idt -= 4; 2415 } 2416 } 2417 2418 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2419 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2420 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2421 PetscFunctionReturn(0); 2422 } 2423 2424 #if defined (PETSC_HAVE_SSE) 2425 2426 #include PETSC_HAVE_SSE 2427 #undef __FUNCT__ 2428 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2429 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2430 { 2431 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2432 unsigned short *aj=(unsigned short *)a->j; 2433 PetscErrorCode ierr; 2434 int *ai=a->i,n=a->mbs,*diag = a->diag; 2435 MatScalar *aa=a->a; 2436 PetscScalar *x,*b; 2437 2438 PetscFunctionBegin; 2439 SSE_SCOPE_BEGIN; 2440 /* 2441 Note: This code currently uses demotion of double 2442 to float when performing the mixed-mode computation. 2443 This may not be numerically reasonable for all applications. 2444 */ 2445 PREFETCH_NTA(aa+16*ai[1]); 2446 2447 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2448 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2449 { 2450 /* x will first be computed in single precision then promoted inplace to double */ 2451 MatScalar *v,*t=(MatScalar *)x; 2452 int nz,i,idt,ai16; 2453 unsigned int jdx,idx; 2454 unsigned short *vi; 2455 /* Forward solve the lower triangular factor. */ 2456 2457 /* First block is the identity. */ 2458 idx = 0; 2459 CONVERT_DOUBLE4_FLOAT4(t,b); 2460 v = aa + 16*((unsigned int)ai[1]); 2461 2462 for (i=1; i<n;) { 2463 PREFETCH_NTA(&v[8]); 2464 vi = aj + ai[i]; 2465 nz = diag[i] - ai[i]; 2466 idx += 4; 2467 2468 /* Demote RHS from double to float. */ 2469 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2470 LOAD_PS(&t[idx],XMM7); 2471 2472 while (nz--) { 2473 PREFETCH_NTA(&v[16]); 2474 jdx = 4*((unsigned int)(*vi++)); 2475 2476 /* 4x4 Matrix-Vector product with negative accumulation: */ 2477 SSE_INLINE_BEGIN_2(&t[jdx],v) 2478 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2479 2480 /* First Column */ 2481 SSE_COPY_PS(XMM0,XMM6) 2482 SSE_SHUFFLE(XMM0,XMM0,0x00) 2483 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2484 SSE_SUB_PS(XMM7,XMM0) 2485 2486 /* Second Column */ 2487 SSE_COPY_PS(XMM1,XMM6) 2488 SSE_SHUFFLE(XMM1,XMM1,0x55) 2489 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2490 SSE_SUB_PS(XMM7,XMM1) 2491 2492 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2493 2494 /* Third Column */ 2495 SSE_COPY_PS(XMM2,XMM6) 2496 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2497 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2498 SSE_SUB_PS(XMM7,XMM2) 2499 2500 /* Fourth Column */ 2501 SSE_COPY_PS(XMM3,XMM6) 2502 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2503 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2504 SSE_SUB_PS(XMM7,XMM3) 2505 SSE_INLINE_END_2 2506 2507 v += 16; 2508 } 2509 v = aa + 16*ai[++i]; 2510 PREFETCH_NTA(v); 2511 STORE_PS(&t[idx],XMM7); 2512 } 2513 2514 /* Backward solve the upper triangular factor.*/ 2515 2516 idt = 4*(n-1); 2517 ai16 = 16*diag[n-1]; 2518 v = aa + ai16 + 16; 2519 for (i=n-1; i>=0;){ 2520 PREFETCH_NTA(&v[8]); 2521 vi = aj + diag[i] + 1; 2522 nz = ai[i+1] - diag[i] - 1; 2523 2524 LOAD_PS(&t[idt],XMM7); 2525 2526 while (nz--) { 2527 PREFETCH_NTA(&v[16]); 2528 idx = 4*((unsigned int)(*vi++)); 2529 2530 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2531 SSE_INLINE_BEGIN_2(&t[idx],v) 2532 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2533 2534 /* First Column */ 2535 SSE_COPY_PS(XMM0,XMM6) 2536 SSE_SHUFFLE(XMM0,XMM0,0x00) 2537 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2538 SSE_SUB_PS(XMM7,XMM0) 2539 2540 /* Second Column */ 2541 SSE_COPY_PS(XMM1,XMM6) 2542 SSE_SHUFFLE(XMM1,XMM1,0x55) 2543 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2544 SSE_SUB_PS(XMM7,XMM1) 2545 2546 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2547 2548 /* Third Column */ 2549 SSE_COPY_PS(XMM2,XMM6) 2550 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2551 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2552 SSE_SUB_PS(XMM7,XMM2) 2553 2554 /* Fourth Column */ 2555 SSE_COPY_PS(XMM3,XMM6) 2556 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2557 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2558 SSE_SUB_PS(XMM7,XMM3) 2559 SSE_INLINE_END_2 2560 v += 16; 2561 } 2562 v = aa + ai16; 2563 ai16 = 16*diag[--i]; 2564 PREFETCH_NTA(aa+ai16+16); 2565 /* 2566 Scale the result by the diagonal 4x4 block, 2567 which was inverted as part of the factorization 2568 */ 2569 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2570 /* First Column */ 2571 SSE_COPY_PS(XMM0,XMM7) 2572 SSE_SHUFFLE(XMM0,XMM0,0x00) 2573 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2574 2575 /* Second Column */ 2576 SSE_COPY_PS(XMM1,XMM7) 2577 SSE_SHUFFLE(XMM1,XMM1,0x55) 2578 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2579 SSE_ADD_PS(XMM0,XMM1) 2580 2581 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2582 2583 /* Third Column */ 2584 SSE_COPY_PS(XMM2,XMM7) 2585 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2586 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2587 SSE_ADD_PS(XMM0,XMM2) 2588 2589 /* Fourth Column */ 2590 SSE_COPY_PS(XMM3,XMM7) 2591 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2592 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2593 SSE_ADD_PS(XMM0,XMM3) 2594 2595 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2596 SSE_INLINE_END_3 2597 2598 v = aa + ai16 + 16; 2599 idt -= 4; 2600 } 2601 2602 /* Convert t from single precision back to double precision (inplace)*/ 2603 idt = 4*(n-1); 2604 for (i=n-1;i>=0;i--) { 2605 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2606 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2607 PetscScalar *xtemp=&x[idt]; 2608 MatScalar *ttemp=&t[idt]; 2609 xtemp[3] = (PetscScalar)ttemp[3]; 2610 xtemp[2] = (PetscScalar)ttemp[2]; 2611 xtemp[1] = (PetscScalar)ttemp[1]; 2612 xtemp[0] = (PetscScalar)ttemp[0]; 2613 idt -= 4; 2614 } 2615 2616 } /* End of artificial scope. */ 2617 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2618 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2619 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2620 SSE_SCOPE_END; 2621 PetscFunctionReturn(0); 2622 } 2623 2624 #undef __FUNCT__ 2625 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2626 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2627 { 2628 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2629 int *aj=a->j; 2630 PetscErrorCode ierr; 2631 int *ai=a->i,n=a->mbs,*diag = a->diag; 2632 MatScalar *aa=a->a; 2633 PetscScalar *x,*b; 2634 2635 PetscFunctionBegin; 2636 SSE_SCOPE_BEGIN; 2637 /* 2638 Note: This code currently uses demotion of double 2639 to float when performing the mixed-mode computation. 2640 This may not be numerically reasonable for all applications. 2641 */ 2642 PREFETCH_NTA(aa+16*ai[1]); 2643 2644 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2645 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2646 { 2647 /* x will first be computed in single precision then promoted inplace to double */ 2648 MatScalar *v,*t=(MatScalar *)x; 2649 int nz,i,idt,ai16; 2650 int jdx,idx; 2651 int *vi; 2652 /* Forward solve the lower triangular factor. */ 2653 2654 /* First block is the identity. */ 2655 idx = 0; 2656 CONVERT_DOUBLE4_FLOAT4(t,b); 2657 v = aa + 16*ai[1]; 2658 2659 for (i=1; i<n;) { 2660 PREFETCH_NTA(&v[8]); 2661 vi = aj + ai[i]; 2662 nz = diag[i] - ai[i]; 2663 idx += 4; 2664 2665 /* Demote RHS from double to float. */ 2666 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2667 LOAD_PS(&t[idx],XMM7); 2668 2669 while (nz--) { 2670 PREFETCH_NTA(&v[16]); 2671 jdx = 4*(*vi++); 2672 /* jdx = *vi++; */ 2673 2674 /* 4x4 Matrix-Vector product with negative accumulation: */ 2675 SSE_INLINE_BEGIN_2(&t[jdx],v) 2676 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2677 2678 /* First Column */ 2679 SSE_COPY_PS(XMM0,XMM6) 2680 SSE_SHUFFLE(XMM0,XMM0,0x00) 2681 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2682 SSE_SUB_PS(XMM7,XMM0) 2683 2684 /* Second Column */ 2685 SSE_COPY_PS(XMM1,XMM6) 2686 SSE_SHUFFLE(XMM1,XMM1,0x55) 2687 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2688 SSE_SUB_PS(XMM7,XMM1) 2689 2690 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2691 2692 /* Third Column */ 2693 SSE_COPY_PS(XMM2,XMM6) 2694 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2695 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2696 SSE_SUB_PS(XMM7,XMM2) 2697 2698 /* Fourth Column */ 2699 SSE_COPY_PS(XMM3,XMM6) 2700 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2701 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2702 SSE_SUB_PS(XMM7,XMM3) 2703 SSE_INLINE_END_2 2704 2705 v += 16; 2706 } 2707 v = aa + 16*ai[++i]; 2708 PREFETCH_NTA(v); 2709 STORE_PS(&t[idx],XMM7); 2710 } 2711 2712 /* Backward solve the upper triangular factor.*/ 2713 2714 idt = 4*(n-1); 2715 ai16 = 16*diag[n-1]; 2716 v = aa + ai16 + 16; 2717 for (i=n-1; i>=0;){ 2718 PREFETCH_NTA(&v[8]); 2719 vi = aj + diag[i] + 1; 2720 nz = ai[i+1] - diag[i] - 1; 2721 2722 LOAD_PS(&t[idt],XMM7); 2723 2724 while (nz--) { 2725 PREFETCH_NTA(&v[16]); 2726 idx = 4*(*vi++); 2727 /* idx = *vi++; */ 2728 2729 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2730 SSE_INLINE_BEGIN_2(&t[idx],v) 2731 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2732 2733 /* First Column */ 2734 SSE_COPY_PS(XMM0,XMM6) 2735 SSE_SHUFFLE(XMM0,XMM0,0x00) 2736 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2737 SSE_SUB_PS(XMM7,XMM0) 2738 2739 /* Second Column */ 2740 SSE_COPY_PS(XMM1,XMM6) 2741 SSE_SHUFFLE(XMM1,XMM1,0x55) 2742 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2743 SSE_SUB_PS(XMM7,XMM1) 2744 2745 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2746 2747 /* Third Column */ 2748 SSE_COPY_PS(XMM2,XMM6) 2749 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2750 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2751 SSE_SUB_PS(XMM7,XMM2) 2752 2753 /* Fourth Column */ 2754 SSE_COPY_PS(XMM3,XMM6) 2755 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2756 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2757 SSE_SUB_PS(XMM7,XMM3) 2758 SSE_INLINE_END_2 2759 v += 16; 2760 } 2761 v = aa + ai16; 2762 ai16 = 16*diag[--i]; 2763 PREFETCH_NTA(aa+ai16+16); 2764 /* 2765 Scale the result by the diagonal 4x4 block, 2766 which was inverted as part of the factorization 2767 */ 2768 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2769 /* First Column */ 2770 SSE_COPY_PS(XMM0,XMM7) 2771 SSE_SHUFFLE(XMM0,XMM0,0x00) 2772 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2773 2774 /* Second Column */ 2775 SSE_COPY_PS(XMM1,XMM7) 2776 SSE_SHUFFLE(XMM1,XMM1,0x55) 2777 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2778 SSE_ADD_PS(XMM0,XMM1) 2779 2780 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2781 2782 /* Third Column */ 2783 SSE_COPY_PS(XMM2,XMM7) 2784 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2785 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2786 SSE_ADD_PS(XMM0,XMM2) 2787 2788 /* Fourth Column */ 2789 SSE_COPY_PS(XMM3,XMM7) 2790 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2791 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2792 SSE_ADD_PS(XMM0,XMM3) 2793 2794 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2795 SSE_INLINE_END_3 2796 2797 v = aa + ai16 + 16; 2798 idt -= 4; 2799 } 2800 2801 /* Convert t from single precision back to double precision (inplace)*/ 2802 idt = 4*(n-1); 2803 for (i=n-1;i>=0;i--) { 2804 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2805 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2806 PetscScalar *xtemp=&x[idt]; 2807 MatScalar *ttemp=&t[idt]; 2808 xtemp[3] = (PetscScalar)ttemp[3]; 2809 xtemp[2] = (PetscScalar)ttemp[2]; 2810 xtemp[1] = (PetscScalar)ttemp[1]; 2811 xtemp[0] = (PetscScalar)ttemp[0]; 2812 idt -= 4; 2813 } 2814 2815 } /* End of artificial scope. */ 2816 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2817 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2818 ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr); 2819 SSE_SCOPE_END; 2820 PetscFunctionReturn(0); 2821 } 2822 2823 #endif 2824 #undef __FUNCT__ 2825 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2826 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2827 { 2828 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2829 IS iscol=a->col,isrow=a->row; 2830 PetscErrorCode ierr; 2831 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2832 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2833 const MatScalar *aa=a->a,*v; 2834 PetscScalar *x,s1,s2,s3,x1,x2,x3,*t; 2835 const PetscScalar *b; 2836 2837 PetscFunctionBegin; 2838 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2839 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2840 t = a->solve_work; 2841 2842 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2843 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2844 2845 /* forward solve the lower triangular */ 2846 idx = 3*(*r++); 2847 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2848 for (i=1; i<n; i++) { 2849 v = aa + 9*ai[i]; 2850 vi = aj + ai[i]; 2851 nz = diag[i] - ai[i]; 2852 idx = 3*(*r++); 2853 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2854 while (nz--) { 2855 idx = 3*(*vi++); 2856 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2857 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2858 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2859 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2860 v += 9; 2861 } 2862 idx = 3*i; 2863 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2864 } 2865 /* backward solve the upper triangular */ 2866 for (i=n-1; i>=0; i--){ 2867 v = aa + 9*diag[i] + 9; 2868 vi = aj + diag[i] + 1; 2869 nz = ai[i+1] - diag[i] - 1; 2870 idt = 3*i; 2871 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2872 while (nz--) { 2873 idx = 3*(*vi++); 2874 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2875 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2876 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2877 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2878 v += 9; 2879 } 2880 idc = 3*(*c--); 2881 v = aa + 9*diag[i]; 2882 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2883 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2884 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2885 } 2886 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2887 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2888 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2889 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2890 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2891 PetscFunctionReturn(0); 2892 } 2893 2894 /* 2895 Special case where the matrix was ILU(0) factored in the natural 2896 ordering. This eliminates the need for the column and row permutation. 2897 */ 2898 #undef __FUNCT__ 2899 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2900 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2901 { 2902 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2903 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 2904 PetscErrorCode ierr; 2905 PetscInt *diag = a->diag; 2906 const MatScalar *aa=a->a,*v; 2907 PetscScalar *x,s1,s2,s3,x1,x2,x3; 2908 const PetscScalar *b; 2909 PetscInt jdx,idt,idx,nz,*vi,i; 2910 2911 PetscFunctionBegin; 2912 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2913 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2914 2915 /* forward solve the lower triangular */ 2916 idx = 0; 2917 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2918 for (i=1; i<n; i++) { 2919 v = aa + 9*ai[i]; 2920 vi = aj + ai[i]; 2921 nz = diag[i] - ai[i]; 2922 idx += 3; 2923 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2924 while (nz--) { 2925 jdx = 3*(*vi++); 2926 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2927 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2928 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2929 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2930 v += 9; 2931 } 2932 x[idx] = s1; 2933 x[1+idx] = s2; 2934 x[2+idx] = s3; 2935 } 2936 /* backward solve the upper triangular */ 2937 for (i=n-1; i>=0; i--){ 2938 v = aa + 9*diag[i] + 9; 2939 vi = aj + diag[i] + 1; 2940 nz = ai[i+1] - diag[i] - 1; 2941 idt = 3*i; 2942 s1 = x[idt]; s2 = x[1+idt]; 2943 s3 = x[2+idt]; 2944 while (nz--) { 2945 idx = 3*(*vi++); 2946 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2947 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2948 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2949 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2950 v += 9; 2951 } 2952 v = aa + 9*diag[i]; 2953 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2954 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2955 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2956 } 2957 2958 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2959 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2960 ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr); 2961 PetscFunctionReturn(0); 2962 } 2963 2964 #undef __FUNCT__ 2965 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2966 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2967 { 2968 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2969 IS iscol=a->col,isrow=a->row; 2970 PetscErrorCode ierr; 2971 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc; 2972 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 2973 const MatScalar *aa=a->a,*v; 2974 PetscScalar *x,s1,s2,x1,x2,*t; 2975 const PetscScalar *b; 2976 2977 PetscFunctionBegin; 2978 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 2979 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2980 t = a->solve_work; 2981 2982 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2983 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2984 2985 /* forward solve the lower triangular */ 2986 idx = 2*(*r++); 2987 t[0] = b[idx]; t[1] = b[1+idx]; 2988 for (i=1; i<n; i++) { 2989 v = aa + 4*ai[i]; 2990 vi = aj + ai[i]; 2991 nz = diag[i] - ai[i]; 2992 idx = 2*(*r++); 2993 s1 = b[idx]; s2 = b[1+idx]; 2994 while (nz--) { 2995 idx = 2*(*vi++); 2996 x1 = t[idx]; x2 = t[1+idx]; 2997 s1 -= v[0]*x1 + v[2]*x2; 2998 s2 -= v[1]*x1 + v[3]*x2; 2999 v += 4; 3000 } 3001 idx = 2*i; 3002 t[idx] = s1; t[1+idx] = s2; 3003 } 3004 /* backward solve the upper triangular */ 3005 for (i=n-1; i>=0; i--){ 3006 v = aa + 4*diag[i] + 4; 3007 vi = aj + diag[i] + 1; 3008 nz = ai[i+1] - diag[i] - 1; 3009 idt = 2*i; 3010 s1 = t[idt]; s2 = t[1+idt]; 3011 while (nz--) { 3012 idx = 2*(*vi++); 3013 x1 = t[idx]; x2 = t[1+idx]; 3014 s1 -= v[0]*x1 + v[2]*x2; 3015 s2 -= v[1]*x1 + v[3]*x2; 3016 v += 4; 3017 } 3018 idc = 2*(*c--); 3019 v = aa + 4*diag[i]; 3020 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 3021 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 3022 } 3023 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3024 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3025 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3026 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3027 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3028 PetscFunctionReturn(0); 3029 } 3030 3031 /* 3032 Special case where the matrix was ILU(0) factored in the natural 3033 ordering. This eliminates the need for the column and row permutation. 3034 */ 3035 #undef __FUNCT__ 3036 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 3037 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 3038 { 3039 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3040 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3041 PetscErrorCode ierr; 3042 PetscInt *diag = a->diag; 3043 const MatScalar *aa=a->a,*v; 3044 PetscScalar *x,s1,s2,x1,x2; 3045 const PetscScalar *b; 3046 PetscInt jdx,idt,idx,nz,*vi,i; 3047 3048 PetscFunctionBegin; 3049 ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3050 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3051 3052 /* forward solve the lower triangular */ 3053 idx = 0; 3054 x[0] = b[0]; x[1] = b[1]; 3055 for (i=1; i<n; i++) { 3056 v = aa + 4*ai[i]; 3057 vi = aj + ai[i]; 3058 nz = diag[i] - ai[i]; 3059 idx += 2; 3060 s1 = b[idx];s2 = b[1+idx]; 3061 while (nz--) { 3062 jdx = 2*(*vi++); 3063 x1 = x[jdx];x2 = x[1+jdx]; 3064 s1 -= v[0]*x1 + v[2]*x2; 3065 s2 -= v[1]*x1 + v[3]*x2; 3066 v += 4; 3067 } 3068 x[idx] = s1; 3069 x[1+idx] = s2; 3070 } 3071 /* backward solve the upper triangular */ 3072 for (i=n-1; i>=0; i--){ 3073 v = aa + 4*diag[i] + 4; 3074 vi = aj + diag[i] + 1; 3075 nz = ai[i+1] - diag[i] - 1; 3076 idt = 2*i; 3077 s1 = x[idt]; s2 = x[1+idt]; 3078 while (nz--) { 3079 idx = 2*(*vi++); 3080 x1 = x[idx]; x2 = x[1+idx]; 3081 s1 -= v[0]*x1 + v[2]*x2; 3082 s2 -= v[1]*x1 + v[3]*x2; 3083 v += 4; 3084 } 3085 v = aa + 4*diag[i]; 3086 x[idt] = v[0]*s1 + v[2]*s2; 3087 x[1+idt] = v[1]*s1 + v[3]*s2; 3088 } 3089 3090 ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr); 3091 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3092 ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr); 3093 PetscFunctionReturn(0); 3094 } 3095 3096 #undef __FUNCT__ 3097 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 3098 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 3099 { 3100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 3101 IS iscol=a->col,isrow=a->row; 3102 PetscErrorCode ierr; 3103 PetscInt i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 3104 const PetscInt *r,*c,*diag = a->diag,*rout,*cout; 3105 MatScalar *aa=a->a,*v; 3106 PetscScalar *x,*b,s1,*t; 3107 3108 PetscFunctionBegin; 3109 if (!n) PetscFunctionReturn(0); 3110 3111 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3112 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3113 t = a->solve_work; 3114 3115 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 3116 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 3117 3118 /* forward solve the lower triangular */ 3119 t[0] = b[*r++]; 3120 for (i=1; i<n; i++) { 3121 v = aa + ai[i]; 3122 vi = aj + ai[i]; 3123 nz = diag[i] - ai[i]; 3124 s1 = b[*r++]; 3125 while (nz--) { 3126 s1 -= (*v++)*t[*vi++]; 3127 } 3128 t[i] = s1; 3129 } 3130 /* backward solve the upper triangular */ 3131 for (i=n-1; i>=0; i--){ 3132 v = aa + diag[i] + 1; 3133 vi = aj + diag[i] + 1; 3134 nz = ai[i+1] - diag[i] - 1; 3135 s1 = t[i]; 3136 while (nz--) { 3137 s1 -= (*v++)*t[*vi++]; 3138 } 3139 x[*c--] = t[i] = aa[diag[i]]*s1; 3140 } 3141 3142 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 3143 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 3144 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3145 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3146 ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3147 PetscFunctionReturn(0); 3148 } 3149 /* 3150 Special case where the matrix was ILU(0) factored in the natural 3151 ordering. This eliminates the need for the column and row permutation. 3152 */ 3153 #undef __FUNCT__ 3154 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 3155 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 3156 { 3157 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3158 PetscInt n=a->mbs,*ai=a->i,*aj=a->j; 3159 PetscErrorCode ierr; 3160 PetscInt *diag = a->diag; 3161 MatScalar *aa=a->a; 3162 PetscScalar *x,*b; 3163 PetscScalar s1,x1; 3164 MatScalar *v; 3165 PetscInt jdx,idt,idx,nz,*vi,i; 3166 3167 PetscFunctionBegin; 3168 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 3169 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 3170 3171 /* forward solve the lower triangular */ 3172 idx = 0; 3173 x[0] = b[0]; 3174 for (i=1; i<n; i++) { 3175 v = aa + ai[i]; 3176 vi = aj + ai[i]; 3177 nz = diag[i] - ai[i]; 3178 idx += 1; 3179 s1 = b[idx]; 3180 while (nz--) { 3181 jdx = *vi++; 3182 x1 = x[jdx]; 3183 s1 -= v[0]*x1; 3184 v += 1; 3185 } 3186 x[idx] = s1; 3187 } 3188 /* backward solve the upper triangular */ 3189 for (i=n-1; i>=0; i--){ 3190 v = aa + diag[i] + 1; 3191 vi = aj + diag[i] + 1; 3192 nz = ai[i+1] - diag[i] - 1; 3193 idt = i; 3194 s1 = x[idt]; 3195 while (nz--) { 3196 idx = *vi++; 3197 x1 = x[idx]; 3198 s1 -= v[0]*x1; 3199 v += 1; 3200 } 3201 v = aa + diag[i]; 3202 x[idt] = v[0]*s1; 3203 } 3204 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3205 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3206 ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr); 3207 PetscFunctionReturn(0); 3208 } 3209 3210 /* ----------------------------------------------------------------*/ 3211 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption); 3212 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth); 3213 3214 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec); 3215 #undef __FUNCT__ 3216 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct" 3217 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info) 3218 { 3219 Mat C=B; 3220 Mat_SeqBAIJ *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data; 3221 IS isrow = b->row,isicol = b->icol; 3222 PetscErrorCode ierr; 3223 const PetscInt *r,*ic,*ics; 3224 PetscInt i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j; 3225 PetscInt *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj; 3226 MatScalar *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a; 3227 PetscInt bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg; 3228 MatScalar *v_work; 3229 3230 PetscFunctionBegin; 3231 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3232 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3233 ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr); 3234 ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr); 3235 ics = ic; 3236 3237 /* generate work space needed by dense LU factorization */ 3238 ierr = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr); 3239 multiplier = v_work + bs; 3240 v_pivots = (PetscInt*)(multiplier + bs2); 3241 3242 for (i=0; i<n; i++){ 3243 /* zero rtmp */ 3244 /* L part */ 3245 nz = bi[i+1] - bi[i]; 3246 bjtmp = bj + bi[i]; 3247 for (j=0; j<nz; j++){ 3248 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3249 } 3250 3251 /* U part */ 3252 nz = bi[2*n-i+1] - bi[2*n-i]; 3253 bjtmp = bj + bi[2*n-i]; 3254 for (j=0; j<nz; j++){ 3255 ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3256 } 3257 3258 /* load in initial (unfactored row) */ 3259 nz = ai[r[i]+1] - ai[r[i]]; 3260 ajtmp = aj + ai[r[i]]; 3261 v = aa + bs2*ai[r[i]]; 3262 for (j=0; j<nz; j++) { 3263 ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr); 3264 } 3265 3266 /* elimination */ 3267 bjtmp = bj + bi[i]; 3268 row = *bjtmp++; 3269 nzL = bi[i+1] - bi[i]; 3270 k = 0; 3271 while (k < nzL) { 3272 pc = rtmp + bs2*row; 3273 for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }} 3274 if (flg) { 3275 pv = b->a + bs2*bdiag[row]; 3276 Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */ 3277 pj = b->j + bi[2*n-row]; /* begining of U(row,:) */ 3278 pv = b->a + bs2*bi[2*n-row]; 3279 nz = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */ 3280 for (j=0; j<nz; j++) { 3281 Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j); 3282 } 3283 ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr); 3284 } 3285 row = *bjtmp++; k++; 3286 } 3287 3288 /* finished row so stick it into b->a */ 3289 /* L part */ 3290 pv = b->a + bs2*bi[i] ; 3291 pj = b->j + bi[i] ; 3292 nz = bi[i+1] - bi[i]; 3293 for (j=0; j<nz; j++) { 3294 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3295 } 3296 3297 /* Mark diagonal and invert diagonal for simplier triangular solves */ 3298 pv = b->a + bs2*bdiag[i]; 3299 pj = b->j + bdiag[i]; 3300 /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */ 3301 ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3302 ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr); 3303 3304 /* U part */ 3305 pv = b->a + bs2*bi[2*n-i]; 3306 pj = b->j + bi[2*n-i]; 3307 nz = bi[2*n-i+1] - bi[2*n-i] - 1; 3308 for (j=0; j<nz; j++){ 3309 ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr); 3310 } 3311 } 3312 3313 ierr = PetscFree(rtmp);CHKERRQ(ierr); 3314 ierr = PetscFree(v_work);CHKERRQ(ierr); 3315 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3316 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3317 3318 switch (A->rmap->bs){ 3319 case 2: 3320 C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct; 3321 break; 3322 3323 case 5: 3324 C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct; 3325 break; 3326 default: 3327 C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct; 3328 break; 3329 } 3330 C->assembled = PETSC_TRUE; 3331 ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */ 3332 PetscFunctionReturn(0); 3333 } 3334 3335 /* 3336 ilu(0) with natural ordering under new data structure. 3337 Factored arrays bj and ba are stored as 3338 L(0,:), L(1,:), ...,L(n-1,:), U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:) 3339 3340 bi=fact->i is an array of size 2n+2, in which 3341 bi+ 3342 bi[i] -> 1st entry of L(i,:),i=0,...,i-1 3343 bi[n] -> end of L(n-1,:)+1 3344 bi[n+1] -> 1st entry of U(n-1,:) 3345 bi[2n-i] -> 1st entry of U(i,:) 3346 bi[2n-i+1] -> end of U(i,:)+1, the 1st entry of U(i-1,:) 3347 bi[2n] -> end of U(0,:)+1 3348 3349 U(i,:) contains diag[i] as its last entry, i.e., 3350 U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i]) 3351 */ 3352 #undef __FUNCT__ 3353 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct" 3354 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3355 { 3356 3357 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3358 PetscErrorCode ierr; 3359 PetscInt mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2; 3360 PetscInt i,j,nz=a->nz,*bi,*bj,*bdiag; 3361 3362 PetscFunctionBegin; 3363 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3364 b = (Mat_SeqBAIJ*)(fact)->data; 3365 bdiag = b->diag; 3366 3367 /* replace matrix arrays with single allocations, then reset values */ 3368 ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr); 3369 3370 ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr); 3371 ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr); 3372 ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr); 3373 b->singlemalloc = PETSC_FALSE; 3374 if (mbs > 0) { 3375 ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr); 3376 } 3377 3378 /* set bi and bj with new data structure */ 3379 bi = b->i; 3380 bj = b->j; 3381 3382 /* L part */ 3383 bi[0] = 0; 3384 for (i=0; i<mbs; i++){ 3385 nz = adiag[i] - ai[i]; 3386 bi[i+1] = bi[i] + nz; 3387 aj = a->j + ai[i]; 3388 for (j=0; j<nz; j++){ 3389 *bj = aj[j]; bj++; 3390 } 3391 } 3392 3393 /* U part */ 3394 bi[mbs+1] = bi[mbs]; 3395 for (i=mbs-1; i>=0; i--){ 3396 nz = ai[i+1] - adiag[i] - 1; 3397 if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz); 3398 bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1; 3399 aj = a->j + adiag[i] + 1; 3400 for (j=0; j<nz; j++){ 3401 *bj = aj[j]; bj++; 3402 } 3403 /* diag[i] */ 3404 *bj = i; bj++; 3405 bdiag[i] = bi[2*mbs-i+1]-1; 3406 } 3407 PetscFunctionReturn(0); 3408 } 3409 3410 /* 3411 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3412 except that the data structure of Mat_SeqAIJ is slightly different. 3413 Not a good example of code reuse. 3414 */ 3415 3416 #undef __FUNCT__ 3417 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3418 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info) 3419 { 3420 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3421 IS isicol; 3422 PetscErrorCode ierr; 3423 const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi; 3424 PetscInt prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp; 3425 PetscInt *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0; 3426 PetscInt incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd; 3427 PetscTruth col_identity,row_identity,both_identity,flg; 3428 PetscReal f; 3429 3430 PetscFunctionBegin; 3431 ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr); 3432 if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd); 3433 3434 f = info->fill; 3435 levels = (PetscInt)info->levels; 3436 diagonal_fill = (PetscInt)info->diagonal_fill; 3437 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3438 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3439 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3440 both_identity = (PetscTruth) (row_identity && col_identity); 3441 3442 if (!levels && both_identity) { /* special case copy the nonzero structure */ 3443 3444 PetscTruth newdatastruct=PETSC_FALSE; 3445 ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr); 3446 if (newdatastruct){ 3447 ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr); 3448 (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct; 3449 } else { 3450 ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr); 3451 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3452 } 3453 3454 fact->factor = MAT_FACTOR_ILU; 3455 b = (Mat_SeqBAIJ*)(fact)->data; 3456 b->row = isrow; 3457 b->col = iscol; 3458 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3459 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3460 b->icol = isicol; 3461 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3462 ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3463 PetscFunctionReturn(0); 3464 } 3465 3466 /* general case perform the symbolic factorization */ 3467 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3468 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3469 3470 /* get new row pointers */ 3471 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr); 3472 ainew[0] = 0; 3473 /* don't know how many column pointers are needed so estimate */ 3474 jmax = (PetscInt)(f*ai[n] + 1); 3475 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr); 3476 /* ajfill is level of fill for each fill entry */ 3477 ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr); 3478 /* fill is a linked list of nonzeros in active row */ 3479 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr); 3480 /* im is level for each filled value */ 3481 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr); 3482 /* dloc is location of diagonal in factor */ 3483 ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr); 3484 dloc[0] = 0; 3485 for (prow=0; prow<n; prow++) { 3486 3487 /* copy prow into linked list */ 3488 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3489 if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow); 3490 xi = aj + ai[r[prow]]; 3491 fill[n] = n; 3492 fill[prow] = -1; /* marker for diagonal entry */ 3493 while (nz--) { 3494 fm = n; 3495 idx = ic[*xi++]; 3496 do { 3497 m = fm; 3498 fm = fill[m]; 3499 } while (fm < idx); 3500 fill[m] = idx; 3501 fill[idx] = fm; 3502 im[idx] = 0; 3503 } 3504 3505 /* make sure diagonal entry is included */ 3506 if (diagonal_fill && fill[prow] == -1) { 3507 fm = n; 3508 while (fill[fm] < prow) fm = fill[fm]; 3509 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3510 fill[fm] = prow; 3511 im[prow] = 0; 3512 nzf++; 3513 dcount++; 3514 } 3515 3516 nzi = 0; 3517 row = fill[n]; 3518 while (row < prow) { 3519 incrlev = im[row] + 1; 3520 nz = dloc[row]; 3521 xi = ajnew + ainew[row] + nz + 1; 3522 flev = ajfill + ainew[row] + nz + 1; 3523 nnz = ainew[row+1] - ainew[row] - nz - 1; 3524 fm = row; 3525 while (nnz-- > 0) { 3526 idx = *xi++; 3527 if (*flev + incrlev > levels) { 3528 flev++; 3529 continue; 3530 } 3531 do { 3532 m = fm; 3533 fm = fill[m]; 3534 } while (fm < idx); 3535 if (fm != idx) { 3536 im[idx] = *flev + incrlev; 3537 fill[m] = idx; 3538 fill[idx] = fm; 3539 fm = idx; 3540 nzf++; 3541 } else { 3542 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3543 } 3544 flev++; 3545 } 3546 row = fill[row]; 3547 nzi++; 3548 } 3549 /* copy new filled row into permanent storage */ 3550 ainew[prow+1] = ainew[prow] + nzf; 3551 if (ainew[prow+1] > jmax) { 3552 3553 /* estimate how much additional space we will need */ 3554 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3555 /* just double the memory each time */ 3556 PetscInt maxadd = jmax; 3557 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3558 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3559 jmax += maxadd; 3560 3561 /* allocate a longer ajnew and ajfill */ 3562 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3563 ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3564 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3565 ajnew = xitmp; 3566 ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr); 3567 ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr); 3568 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3569 ajfill = xitmp; 3570 reallocate++; /* count how many reallocations are needed */ 3571 } 3572 xitmp = ajnew + ainew[prow]; 3573 flev = ajfill + ainew[prow]; 3574 dloc[prow] = nzi; 3575 fm = fill[n]; 3576 while (nzf--) { 3577 *xitmp++ = fm; 3578 *flev++ = im[fm]; 3579 fm = fill[fm]; 3580 } 3581 /* make sure row has diagonal entry */ 3582 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3583 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\ 3584 try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow); 3585 } 3586 } 3587 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3588 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3589 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3590 ierr = PetscFree(fill);CHKERRQ(ierr); 3591 ierr = PetscFree(im);CHKERRQ(ierr); 3592 3593 #if defined(PETSC_USE_INFO) 3594 { 3595 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3596 ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr); 3597 ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr); 3598 ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr); 3599 ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr); 3600 if (diagonal_fill) { 3601 ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr); 3602 } 3603 } 3604 #endif 3605 3606 /* put together the new matrix */ 3607 ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr); 3608 ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr); 3609 b = (Mat_SeqBAIJ*)(fact)->data; 3610 b->free_a = PETSC_TRUE; 3611 b->free_ij = PETSC_TRUE; 3612 b->singlemalloc = PETSC_FALSE; 3613 ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr); 3614 b->j = ajnew; 3615 b->i = ainew; 3616 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3617 b->diag = dloc; 3618 b->ilen = 0; 3619 b->imax = 0; 3620 b->row = isrow; 3621 b->col = iscol; 3622 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3623 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3624 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3625 b->icol = isicol; 3626 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3627 /* In b structure: Free imax, ilen, old a, old j. 3628 Allocate dloc, solve_work, new a, new j */ 3629 ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr); 3630 b->maxnz = b->nz = ainew[n]; 3631 3632 (fact)->info.factor_mallocs = reallocate; 3633 (fact)->info.fill_ratio_given = f; 3634 (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3635 3636 ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr); 3637 PetscFunctionReturn(0); 3638 } 3639 3640 #undef __FUNCT__ 3641 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3642 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3643 { 3644 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3645 /* int i,*AJ=a->j,nz=a->nz; */ 3646 PetscFunctionBegin; 3647 /* Undo Column scaling */ 3648 /* while (nz--) { */ 3649 /* AJ[i] = AJ[i]/4; */ 3650 /* } */ 3651 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3652 A->ops->setunfactored = PETSC_NULL; 3653 PetscFunctionReturn(0); 3654 } 3655 3656 #undef __FUNCT__ 3657 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3658 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3659 { 3660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3661 PetscInt *AJ=a->j,nz=a->nz; 3662 unsigned short *aj=(unsigned short *)AJ; 3663 PetscFunctionBegin; 3664 /* Is this really necessary? */ 3665 while (nz--) { 3666 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3667 } 3668 A->ops->setunfactored = PETSC_NULL; 3669 PetscFunctionReturn(0); 3670 } 3671 3672 3673