1 /*$Id: baijfact2.c,v 1.69 2001/08/06 21:15:36 bsmith Exp balay $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/vec/vecimpl.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 int *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 23 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 24 25 /* forward solve the U^T */ 26 for (i=0; i<n; i++) { 27 28 v = aa + diag[i]; 29 /* multiply by the inverse of the block diagonal */ 30 s1 = (*v++)*b[i]; 31 vi = aj + diag[i] + 1; 32 nz = ai[i+1] - diag[i] - 1; 33 while (nz--) { 34 x[*vi++] -= (*v++)*s1; 35 } 36 x[i] = s1; 37 } 38 /* backward solve the L^T */ 39 for (i=n-1; i>=0; i--){ 40 v = aa + diag[i] - 1; 41 vi = aj + diag[i] - 1; 42 nz = diag[i] - ai[i]; 43 s1 = x[i]; 44 while (nz--) { 45 x[*vi--] -= (*v--)*s1; 46 } 47 } 48 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 49 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 50 PetscLogFlops(2*(a->nz) - A->n); 51 PetscFunctionReturn(0); 52 } 53 54 #undef __FUNCT__ 55 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 56 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 57 { 58 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 59 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 60 int *diag = a->diag,oidx; 61 MatScalar *aa=a->a,*v; 62 PetscScalar s1,s2,x1,x2; 63 PetscScalar *x,*b; 64 65 PetscFunctionBegin; 66 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 67 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 68 69 /* forward solve the U^T */ 70 idx = 0; 71 for (i=0; i<n; i++) { 72 73 v = aa + 4*diag[i]; 74 /* multiply by the inverse of the block diagonal */ 75 x1 = b[idx]; x2 = b[1+idx]; 76 s1 = v[0]*x1 + v[1]*x2; 77 s2 = v[2]*x1 + v[3]*x2; 78 v += 4; 79 80 vi = aj + diag[i] + 1; 81 nz = ai[i+1] - diag[i] - 1; 82 while (nz--) { 83 oidx = 2*(*vi++); 84 x[oidx] -= v[0]*s1 + v[1]*s2; 85 x[oidx+1] -= v[2]*s1 + v[3]*s2; 86 v += 4; 87 } 88 x[idx] = s1;x[1+idx] = s2; 89 idx += 2; 90 } 91 /* backward solve the L^T */ 92 for (i=n-1; i>=0; i--){ 93 v = aa + 4*diag[i] - 4; 94 vi = aj + diag[i] - 1; 95 nz = diag[i] - ai[i]; 96 idt = 2*i; 97 s1 = x[idt]; s2 = x[1+idt]; 98 while (nz--) { 99 idx = 2*(*vi--); 100 x[idx] -= v[0]*s1 + v[1]*s2; 101 x[idx+1] -= v[2]*s1 + v[3]*s2; 102 v -= 4; 103 } 104 } 105 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 106 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 107 PetscLogFlops(2*4*(a->nz) - 2*A->n); 108 PetscFunctionReturn(0); 109 } 110 111 #undef __FUNCT__ 112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 114 { 115 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 116 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 117 int *diag = a->diag,oidx; 118 MatScalar *aa=a->a,*v; 119 PetscScalar s1,s2,s3,x1,x2,x3; 120 PetscScalar *x,*b; 121 122 PetscFunctionBegin; 123 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 124 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 125 126 /* forward solve the U^T */ 127 idx = 0; 128 for (i=0; i<n; i++) { 129 130 v = aa + 9*diag[i]; 131 /* multiply by the inverse of the block diagonal */ 132 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; 133 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 134 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 135 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 136 v += 9; 137 138 vi = aj + diag[i] + 1; 139 nz = ai[i+1] - diag[i] - 1; 140 while (nz--) { 141 oidx = 3*(*vi++); 142 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 143 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 144 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 145 v += 9; 146 } 147 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 148 idx += 3; 149 } 150 /* backward solve the L^T */ 151 for (i=n-1; i>=0; i--){ 152 v = aa + 9*diag[i] - 9; 153 vi = aj + diag[i] - 1; 154 nz = diag[i] - ai[i]; 155 idt = 3*i; 156 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 157 while (nz--) { 158 idx = 3*(*vi--); 159 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 160 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 161 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 162 v -= 9; 163 } 164 } 165 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 166 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 167 PetscLogFlops(2*9*(a->nz) - 3*A->n); 168 PetscFunctionReturn(0); 169 } 170 171 #undef __FUNCT__ 172 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 173 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 174 { 175 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 176 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 177 int *diag = a->diag,oidx; 178 MatScalar *aa=a->a,*v; 179 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 180 PetscScalar *x,*b; 181 182 PetscFunctionBegin; 183 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 184 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 185 186 /* forward solve the U^T */ 187 idx = 0; 188 for (i=0; i<n; i++) { 189 190 v = aa + 16*diag[i]; 191 /* multiply by the inverse of the block diagonal */ 192 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; 193 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 194 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 195 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 196 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 197 v += 16; 198 199 vi = aj + diag[i] + 1; 200 nz = ai[i+1] - diag[i] - 1; 201 while (nz--) { 202 oidx = 4*(*vi++); 203 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 204 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 205 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 206 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 207 v += 16; 208 } 209 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 210 idx += 4; 211 } 212 /* backward solve the L^T */ 213 for (i=n-1; i>=0; i--){ 214 v = aa + 16*diag[i] - 16; 215 vi = aj + diag[i] - 1; 216 nz = diag[i] - ai[i]; 217 idt = 4*i; 218 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 219 while (nz--) { 220 idx = 4*(*vi--); 221 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 222 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 223 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 224 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 225 v -= 16; 226 } 227 } 228 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 229 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 230 PetscLogFlops(2*16*(a->nz) - 4*A->n); 231 PetscFunctionReturn(0); 232 } 233 234 #undef __FUNCT__ 235 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 236 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 237 { 238 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 239 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 240 int *diag = a->diag,oidx; 241 MatScalar *aa=a->a,*v; 242 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 243 PetscScalar *x,*b; 244 245 PetscFunctionBegin; 246 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 247 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 248 249 /* forward solve the U^T */ 250 idx = 0; 251 for (i=0; i<n; i++) { 252 253 v = aa + 25*diag[i]; 254 /* multiply by the inverse of the block diagonal */ 255 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 256 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 257 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 258 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 259 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 260 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 261 v += 25; 262 263 vi = aj + diag[i] + 1; 264 nz = ai[i+1] - diag[i] - 1; 265 while (nz--) { 266 oidx = 5*(*vi++); 267 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 268 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 269 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 270 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 271 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 272 v += 25; 273 } 274 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 275 idx += 5; 276 } 277 /* backward solve the L^T */ 278 for (i=n-1; i>=0; i--){ 279 v = aa + 25*diag[i] - 25; 280 vi = aj + diag[i] - 1; 281 nz = diag[i] - ai[i]; 282 idt = 5*i; 283 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 284 while (nz--) { 285 idx = 5*(*vi--); 286 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 287 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 288 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 289 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 290 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 291 v -= 25; 292 } 293 } 294 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 295 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 296 PetscLogFlops(2*25*(a->nz) - 5*A->n); 297 PetscFunctionReturn(0); 298 } 299 300 #undef __FUNCT__ 301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 302 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 303 { 304 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 305 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 306 int *diag = a->diag,oidx; 307 MatScalar *aa=a->a,*v; 308 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 309 PetscScalar *x,*b; 310 311 PetscFunctionBegin; 312 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 313 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 314 315 /* forward solve the U^T */ 316 idx = 0; 317 for (i=0; i<n; i++) { 318 319 v = aa + 36*diag[i]; 320 /* multiply by the inverse of the block diagonal */ 321 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 322 x6 = b[5+idx]; 323 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 324 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 325 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 326 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 327 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 328 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 329 v += 36; 330 331 vi = aj + diag[i] + 1; 332 nz = ai[i+1] - diag[i] - 1; 333 while (nz--) { 334 oidx = 6*(*vi++); 335 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 336 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 337 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 338 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 339 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 340 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 341 v += 36; 342 } 343 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 344 x[5+idx] = s6; 345 idx += 6; 346 } 347 /* backward solve the L^T */ 348 for (i=n-1; i>=0; i--){ 349 v = aa + 36*diag[i] - 36; 350 vi = aj + diag[i] - 1; 351 nz = diag[i] - ai[i]; 352 idt = 6*i; 353 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 354 s6 = x[5+idt]; 355 while (nz--) { 356 idx = 6*(*vi--); 357 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 358 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 359 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 360 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 361 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 362 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 363 v -= 36; 364 } 365 } 366 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 367 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 368 PetscLogFlops(2*36*(a->nz) - 6*A->n); 369 PetscFunctionReturn(0); 370 } 371 372 #undef __FUNCT__ 373 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 374 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 375 { 376 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 377 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 378 int *diag = a->diag,oidx; 379 MatScalar *aa=a->a,*v; 380 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 381 PetscScalar *x,*b; 382 383 PetscFunctionBegin; 384 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 385 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 386 387 /* forward solve the U^T */ 388 idx = 0; 389 for (i=0; i<n; i++) { 390 391 v = aa + 49*diag[i]; 392 /* multiply by the inverse of the block diagonal */ 393 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 394 x6 = b[5+idx]; x7 = b[6+idx]; 395 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 396 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 397 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 398 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 399 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 400 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 401 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 402 v += 49; 403 404 vi = aj + diag[i] + 1; 405 nz = ai[i+1] - diag[i] - 1; 406 while (nz--) { 407 oidx = 7*(*vi++); 408 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 409 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 410 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 411 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 412 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 413 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 414 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 415 v += 49; 416 } 417 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 418 x[5+idx] = s6;x[6+idx] = s7; 419 idx += 7; 420 } 421 /* backward solve the L^T */ 422 for (i=n-1; i>=0; i--){ 423 v = aa + 49*diag[i] - 49; 424 vi = aj + diag[i] - 1; 425 nz = diag[i] - ai[i]; 426 idt = 7*i; 427 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 428 s6 = x[5+idt];s7 = x[6+idt]; 429 while (nz--) { 430 idx = 7*(*vi--); 431 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 432 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 433 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 434 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 435 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 436 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 437 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 438 v -= 49; 439 } 440 } 441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 443 PetscLogFlops(2*49*(a->nz) - 7*A->n); 444 PetscFunctionReturn(0); 445 } 446 447 /*---------------------------------------------------------------------------------------------*/ 448 #undef __FUNCT__ 449 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 450 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 451 { 452 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 453 IS iscol=a->col,isrow=a->row; 454 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 455 int *diag = a->diag; 456 MatScalar *aa=a->a,*v; 457 PetscScalar s1,*x,*b,*t; 458 459 PetscFunctionBegin; 460 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 461 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 462 t = a->solve_work; 463 464 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 465 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 466 467 /* copy the b into temp work space according to permutation */ 468 for (i=0; i<n; i++) { 469 t[i] = b[c[i]]; 470 } 471 472 /* forward solve the U^T */ 473 for (i=0; i<n; i++) { 474 475 v = aa + diag[i]; 476 /* multiply by the inverse of the block diagonal */ 477 s1 = (*v++)*t[i]; 478 vi = aj + diag[i] + 1; 479 nz = ai[i+1] - diag[i] - 1; 480 while (nz--) { 481 t[*vi++] -= (*v++)*s1; 482 } 483 t[i] = s1; 484 } 485 /* backward solve the L^T */ 486 for (i=n-1; i>=0; i--){ 487 v = aa + diag[i] - 1; 488 vi = aj + diag[i] - 1; 489 nz = diag[i] - ai[i]; 490 s1 = t[i]; 491 while (nz--) { 492 t[*vi--] -= (*v--)*s1; 493 } 494 } 495 496 /* copy t into x according to permutation */ 497 for (i=0; i<n; i++) { 498 x[r[i]] = t[i]; 499 } 500 501 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 502 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 503 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 504 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 505 PetscLogFlops(2*(a->nz) - A->n); 506 PetscFunctionReturn(0); 507 } 508 509 #undef __FUNCT__ 510 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 511 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 512 { 513 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 514 IS iscol=a->col,isrow=a->row; 515 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 516 int *diag = a->diag,ii,ic,ir,oidx; 517 MatScalar *aa=a->a,*v; 518 PetscScalar s1,s2,x1,x2; 519 PetscScalar *x,*b,*t; 520 521 PetscFunctionBegin; 522 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 523 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524 t = a->solve_work; 525 526 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 527 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 528 529 /* copy the b into temp work space according to permutation */ 530 ii = 0; 531 for (i=0; i<n; i++) { 532 ic = 2*c[i]; 533 t[ii] = b[ic]; 534 t[ii+1] = b[ic+1]; 535 ii += 2; 536 } 537 538 /* forward solve the U^T */ 539 idx = 0; 540 for (i=0; i<n; i++) { 541 542 v = aa + 4*diag[i]; 543 /* multiply by the inverse of the block diagonal */ 544 x1 = t[idx]; x2 = t[1+idx]; 545 s1 = v[0]*x1 + v[1]*x2; 546 s2 = v[2]*x1 + v[3]*x2; 547 v += 4; 548 549 vi = aj + diag[i] + 1; 550 nz = ai[i+1] - diag[i] - 1; 551 while (nz--) { 552 oidx = 2*(*vi++); 553 t[oidx] -= v[0]*s1 + v[1]*s2; 554 t[oidx+1] -= v[2]*s1 + v[3]*s2; 555 v += 4; 556 } 557 t[idx] = s1;t[1+idx] = s2; 558 idx += 2; 559 } 560 /* backward solve the L^T */ 561 for (i=n-1; i>=0; i--){ 562 v = aa + 4*diag[i] - 4; 563 vi = aj + diag[i] - 1; 564 nz = diag[i] - ai[i]; 565 idt = 2*i; 566 s1 = t[idt]; s2 = t[1+idt]; 567 while (nz--) { 568 idx = 2*(*vi--); 569 t[idx] -= v[0]*s1 + v[1]*s2; 570 t[idx+1] -= v[2]*s1 + v[3]*s2; 571 v -= 4; 572 } 573 } 574 575 /* copy t into x according to permutation */ 576 ii = 0; 577 for (i=0; i<n; i++) { 578 ir = 2*r[i]; 579 x[ir] = t[ii]; 580 x[ir+1] = t[ii+1]; 581 ii += 2; 582 } 583 584 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 585 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 586 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 587 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 588 PetscLogFlops(2*4*(a->nz) - 2*A->n); 589 PetscFunctionReturn(0); 590 } 591 592 #undef __FUNCT__ 593 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 594 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 595 { 596 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 597 IS iscol=a->col,isrow=a->row; 598 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 599 int *diag = a->diag,ii,ic,ir,oidx; 600 MatScalar *aa=a->a,*v; 601 PetscScalar s1,s2,s3,x1,x2,x3; 602 PetscScalar *x,*b,*t; 603 604 PetscFunctionBegin; 605 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 606 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 607 t = a->solve_work; 608 609 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 610 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 611 612 /* copy the b into temp work space according to permutation */ 613 ii = 0; 614 for (i=0; i<n; i++) { 615 ic = 3*c[i]; 616 t[ii] = b[ic]; 617 t[ii+1] = b[ic+1]; 618 t[ii+2] = b[ic+2]; 619 ii += 3; 620 } 621 622 /* forward solve the U^T */ 623 idx = 0; 624 for (i=0; i<n; i++) { 625 626 v = aa + 9*diag[i]; 627 /* multiply by the inverse of the block diagonal */ 628 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 629 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 630 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 631 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 632 v += 9; 633 634 vi = aj + diag[i] + 1; 635 nz = ai[i+1] - diag[i] - 1; 636 while (nz--) { 637 oidx = 3*(*vi++); 638 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 639 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 640 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 641 v += 9; 642 } 643 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 644 idx += 3; 645 } 646 /* backward solve the L^T */ 647 for (i=n-1; i>=0; i--){ 648 v = aa + 9*diag[i] - 9; 649 vi = aj + diag[i] - 1; 650 nz = diag[i] - ai[i]; 651 idt = 3*i; 652 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 653 while (nz--) { 654 idx = 3*(*vi--); 655 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v -= 9; 659 } 660 } 661 662 /* copy t into x according to permutation */ 663 ii = 0; 664 for (i=0; i<n; i++) { 665 ir = 3*r[i]; 666 x[ir] = t[ii]; 667 x[ir+1] = t[ii+1]; 668 x[ir+2] = t[ii+2]; 669 ii += 3; 670 } 671 672 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 673 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 674 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 675 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 676 PetscLogFlops(2*9*(a->nz) - 3*A->n); 677 PetscFunctionReturn(0); 678 } 679 680 #undef __FUNCT__ 681 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 682 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 683 { 684 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 685 IS iscol=a->col,isrow=a->row; 686 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 687 int *diag = a->diag,ii,ic,ir,oidx; 688 MatScalar *aa=a->a,*v; 689 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 690 PetscScalar *x,*b,*t; 691 692 PetscFunctionBegin; 693 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 694 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 695 t = a->solve_work; 696 697 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 698 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 699 700 /* copy the b into temp work space according to permutation */ 701 ii = 0; 702 for (i=0; i<n; i++) { 703 ic = 4*c[i]; 704 t[ii] = b[ic]; 705 t[ii+1] = b[ic+1]; 706 t[ii+2] = b[ic+2]; 707 t[ii+3] = b[ic+3]; 708 ii += 4; 709 } 710 711 /* forward solve the U^T */ 712 idx = 0; 713 for (i=0; i<n; i++) { 714 715 v = aa + 16*diag[i]; 716 /* multiply by the inverse of the block diagonal */ 717 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 718 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 719 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 720 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 721 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 722 v += 16; 723 724 vi = aj + diag[i] + 1; 725 nz = ai[i+1] - diag[i] - 1; 726 while (nz--) { 727 oidx = 4*(*vi++); 728 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 729 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 730 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 731 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 732 v += 16; 733 } 734 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 735 idx += 4; 736 } 737 /* backward solve the L^T */ 738 for (i=n-1; i>=0; i--){ 739 v = aa + 16*diag[i] - 16; 740 vi = aj + diag[i] - 1; 741 nz = diag[i] - ai[i]; 742 idt = 4*i; 743 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 744 while (nz--) { 745 idx = 4*(*vi--); 746 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v -= 16; 751 } 752 } 753 754 /* copy t into x according to permutation */ 755 ii = 0; 756 for (i=0; i<n; i++) { 757 ir = 4*r[i]; 758 x[ir] = t[ii]; 759 x[ir+1] = t[ii+1]; 760 x[ir+2] = t[ii+2]; 761 x[ir+3] = t[ii+3]; 762 ii += 4; 763 } 764 765 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 766 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 767 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 768 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769 PetscLogFlops(2*16*(a->nz) - 4*A->n); 770 PetscFunctionReturn(0); 771 } 772 773 #undef __FUNCT__ 774 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 775 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 776 { 777 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 778 IS iscol=a->col,isrow=a->row; 779 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 780 int *diag = a->diag,ii,ic,ir,oidx; 781 MatScalar *aa=a->a,*v; 782 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 783 PetscScalar *x,*b,*t; 784 785 PetscFunctionBegin; 786 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 787 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 788 t = a->solve_work; 789 790 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 791 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 792 793 /* copy the b into temp work space according to permutation */ 794 ii = 0; 795 for (i=0; i<n; i++) { 796 ic = 5*c[i]; 797 t[ii] = b[ic]; 798 t[ii+1] = b[ic+1]; 799 t[ii+2] = b[ic+2]; 800 t[ii+3] = b[ic+3]; 801 t[ii+4] = b[ic+4]; 802 ii += 5; 803 } 804 805 /* forward solve the U^T */ 806 idx = 0; 807 for (i=0; i<n; i++) { 808 809 v = aa + 25*diag[i]; 810 /* multiply by the inverse of the block diagonal */ 811 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 812 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 813 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 814 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 815 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 816 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 817 v += 25; 818 819 vi = aj + diag[i] + 1; 820 nz = ai[i+1] - diag[i] - 1; 821 while (nz--) { 822 oidx = 5*(*vi++); 823 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 824 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 825 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 826 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 827 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 828 v += 25; 829 } 830 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 831 idx += 5; 832 } 833 /* backward solve the L^T */ 834 for (i=n-1; i>=0; i--){ 835 v = aa + 25*diag[i] - 25; 836 vi = aj + diag[i] - 1; 837 nz = diag[i] - ai[i]; 838 idt = 5*i; 839 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 840 while (nz--) { 841 idx = 5*(*vi--); 842 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v -= 25; 848 } 849 } 850 851 /* copy t into x according to permutation */ 852 ii = 0; 853 for (i=0; i<n; i++) { 854 ir = 5*r[i]; 855 x[ir] = t[ii]; 856 x[ir+1] = t[ii+1]; 857 x[ir+2] = t[ii+2]; 858 x[ir+3] = t[ii+3]; 859 x[ir+4] = t[ii+4]; 860 ii += 5; 861 } 862 863 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 864 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 865 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 866 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 867 PetscLogFlops(2*25*(a->nz) - 5*A->n); 868 PetscFunctionReturn(0); 869 } 870 871 #undef __FUNCT__ 872 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 873 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 874 { 875 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 876 IS iscol=a->col,isrow=a->row; 877 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 878 int *diag = a->diag,ii,ic,ir,oidx; 879 MatScalar *aa=a->a,*v; 880 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 881 PetscScalar *x,*b,*t; 882 883 PetscFunctionBegin; 884 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 886 t = a->solve_work; 887 888 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 889 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 890 891 /* copy the b into temp work space according to permutation */ 892 ii = 0; 893 for (i=0; i<n; i++) { 894 ic = 6*c[i]; 895 t[ii] = b[ic]; 896 t[ii+1] = b[ic+1]; 897 t[ii+2] = b[ic+2]; 898 t[ii+3] = b[ic+3]; 899 t[ii+4] = b[ic+4]; 900 t[ii+5] = b[ic+5]; 901 ii += 6; 902 } 903 904 /* forward solve the U^T */ 905 idx = 0; 906 for (i=0; i<n; i++) { 907 908 v = aa + 36*diag[i]; 909 /* multiply by the inverse of the block diagonal */ 910 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 911 x6 = t[5+idx]; 912 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 913 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 914 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 915 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 916 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 917 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 918 v += 36; 919 920 vi = aj + diag[i] + 1; 921 nz = ai[i+1] - diag[i] - 1; 922 while (nz--) { 923 oidx = 6*(*vi++); 924 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 925 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 926 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 927 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 928 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 929 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 930 v += 36; 931 } 932 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 933 t[5+idx] = s6; 934 idx += 6; 935 } 936 /* backward solve the L^T */ 937 for (i=n-1; i>=0; i--){ 938 v = aa + 36*diag[i] - 36; 939 vi = aj + diag[i] - 1; 940 nz = diag[i] - ai[i]; 941 idt = 6*i; 942 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 943 s6 = t[5+idt]; 944 while (nz--) { 945 idx = 6*(*vi--); 946 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 947 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 948 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 949 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 950 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 951 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 952 v -= 36; 953 } 954 } 955 956 /* copy t into x according to permutation */ 957 ii = 0; 958 for (i=0; i<n; i++) { 959 ir = 6*r[i]; 960 x[ir] = t[ii]; 961 x[ir+1] = t[ii+1]; 962 x[ir+2] = t[ii+2]; 963 x[ir+3] = t[ii+3]; 964 x[ir+4] = t[ii+4]; 965 x[ir+5] = t[ii+5]; 966 ii += 6; 967 } 968 969 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 970 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 971 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 972 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 973 PetscLogFlops(2*36*(a->nz) - 6*A->n); 974 PetscFunctionReturn(0); 975 } 976 977 #undef __FUNCT__ 978 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 979 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 980 { 981 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 982 IS iscol=a->col,isrow=a->row; 983 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 984 int *diag = a->diag,ii,ic,ir,oidx; 985 MatScalar *aa=a->a,*v; 986 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 987 PetscScalar *x,*b,*t; 988 989 PetscFunctionBegin; 990 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 991 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 992 t = a->solve_work; 993 994 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 995 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 996 997 /* copy the b into temp work space according to permutation */ 998 ii = 0; 999 for (i=0; i<n; i++) { 1000 ic = 7*c[i]; 1001 t[ii] = b[ic]; 1002 t[ii+1] = b[ic+1]; 1003 t[ii+2] = b[ic+2]; 1004 t[ii+3] = b[ic+3]; 1005 t[ii+4] = b[ic+4]; 1006 t[ii+5] = b[ic+5]; 1007 t[ii+6] = b[ic+6]; 1008 ii += 7; 1009 } 1010 1011 /* forward solve the U^T */ 1012 idx = 0; 1013 for (i=0; i<n; i++) { 1014 1015 v = aa + 49*diag[i]; 1016 /* multiply by the inverse of the block diagonal */ 1017 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1018 x6 = t[5+idx]; x7 = t[6+idx]; 1019 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1020 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1021 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1022 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1023 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1024 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1025 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1026 v += 49; 1027 1028 vi = aj + diag[i] + 1; 1029 nz = ai[i+1] - diag[i] - 1; 1030 while (nz--) { 1031 oidx = 7*(*vi++); 1032 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1033 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1034 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1035 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1036 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1037 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1038 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1039 v += 49; 1040 } 1041 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1042 t[5+idx] = s6;t[6+idx] = s7; 1043 idx += 7; 1044 } 1045 /* backward solve the L^T */ 1046 for (i=n-1; i>=0; i--){ 1047 v = aa + 49*diag[i] - 49; 1048 vi = aj + diag[i] - 1; 1049 nz = diag[i] - ai[i]; 1050 idt = 7*i; 1051 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1052 s6 = t[5+idt];s7 = t[6+idt]; 1053 while (nz--) { 1054 idx = 7*(*vi--); 1055 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1056 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1057 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1058 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1059 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1060 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1061 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1062 v -= 49; 1063 } 1064 } 1065 1066 /* copy t into x according to permutation */ 1067 ii = 0; 1068 for (i=0; i<n; i++) { 1069 ir = 7*r[i]; 1070 x[ir] = t[ii]; 1071 x[ir+1] = t[ii+1]; 1072 x[ir+2] = t[ii+2]; 1073 x[ir+3] = t[ii+3]; 1074 x[ir+4] = t[ii+4]; 1075 x[ir+5] = t[ii+5]; 1076 x[ir+6] = t[ii+6]; 1077 ii += 7; 1078 } 1079 1080 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1081 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1082 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1083 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1084 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1085 PetscFunctionReturn(0); 1086 } 1087 1088 /* ----------------------------------------------------------- */ 1089 #undef __FUNCT__ 1090 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1091 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1092 { 1093 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1094 IS iscol=a->col,isrow=a->row; 1095 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1096 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1097 MatScalar *aa=a->a,*v; 1098 PetscScalar *x,*b,*s,*t,*ls; 1099 1100 PetscFunctionBegin; 1101 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1102 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1103 t = a->solve_work; 1104 1105 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1106 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1107 1108 /* forward solve the lower triangular */ 1109 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1110 for (i=1; i<n; i++) { 1111 v = aa + bs2*ai[i]; 1112 vi = aj + ai[i]; 1113 nz = a->diag[i] - ai[i]; 1114 s = t + bs*i; 1115 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1116 while (nz--) { 1117 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1118 v += bs2; 1119 } 1120 } 1121 /* backward solve the upper triangular */ 1122 ls = a->solve_work + A->n; 1123 for (i=n-1; i>=0; i--){ 1124 v = aa + bs2*(a->diag[i] + 1); 1125 vi = aj + a->diag[i] + 1; 1126 nz = ai[i+1] - a->diag[i] - 1; 1127 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1128 while (nz--) { 1129 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1130 v += bs2; 1131 } 1132 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1133 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1134 } 1135 1136 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1137 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1138 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1139 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1140 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1141 PetscFunctionReturn(0); 1142 } 1143 1144 #undef __FUNCT__ 1145 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1146 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1147 { 1148 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1149 IS iscol=a->col,isrow=a->row; 1150 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1151 int *diag = a->diag; 1152 MatScalar *aa=a->a,*v; 1153 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1154 PetscScalar *x,*b,*t; 1155 1156 PetscFunctionBegin; 1157 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1158 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1159 t = a->solve_work; 1160 1161 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1162 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1163 1164 /* forward solve the lower triangular */ 1165 idx = 7*(*r++); 1166 t[0] = b[idx]; t[1] = b[1+idx]; 1167 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1168 t[5] = b[5+idx]; t[6] = b[6+idx]; 1169 1170 for (i=1; i<n; i++) { 1171 v = aa + 49*ai[i]; 1172 vi = aj + ai[i]; 1173 nz = diag[i] - ai[i]; 1174 idx = 7*(*r++); 1175 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1176 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1177 while (nz--) { 1178 idx = 7*(*vi++); 1179 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1180 x4 = t[3+idx];x5 = t[4+idx]; 1181 x6 = t[5+idx];x7 = t[6+idx]; 1182 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1183 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1184 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1185 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1186 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1187 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1188 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1189 v += 49; 1190 } 1191 idx = 7*i; 1192 t[idx] = s1;t[1+idx] = s2; 1193 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1194 t[5+idx] = s6;t[6+idx] = s7; 1195 } 1196 /* backward solve the upper triangular */ 1197 for (i=n-1; i>=0; i--){ 1198 v = aa + 49*diag[i] + 49; 1199 vi = aj + diag[i] + 1; 1200 nz = ai[i+1] - diag[i] - 1; 1201 idt = 7*i; 1202 s1 = t[idt]; s2 = t[1+idt]; 1203 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1204 s6 = t[5+idt];s7 = t[6+idt]; 1205 while (nz--) { 1206 idx = 7*(*vi++); 1207 x1 = t[idx]; x2 = t[1+idx]; 1208 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1209 x6 = t[5+idx]; x7 = t[6+idx]; 1210 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1211 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1212 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1213 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1214 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1215 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1216 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1217 v += 49; 1218 } 1219 idc = 7*(*c--); 1220 v = aa + 49*diag[i]; 1221 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1222 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1223 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1224 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1225 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1226 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1227 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1228 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1229 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1230 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1231 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1232 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1233 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1234 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1235 } 1236 1237 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1238 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1239 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1240 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1241 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1242 PetscFunctionReturn(0); 1243 } 1244 1245 #undef __FUNCT__ 1246 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1247 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1248 { 1249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1250 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1251 int ierr,*diag = a->diag,jdx; 1252 MatScalar *aa=a->a,*v; 1253 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1254 1255 PetscFunctionBegin; 1256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1258 /* forward solve the lower triangular */ 1259 idx = 0; 1260 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1261 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1262 x[6] = b[6+idx]; 1263 for (i=1; i<n; i++) { 1264 v = aa + 49*ai[i]; 1265 vi = aj + ai[i]; 1266 nz = diag[i] - ai[i]; 1267 idx = 7*i; 1268 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1269 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1270 s7 = b[6+idx]; 1271 while (nz--) { 1272 jdx = 7*(*vi++); 1273 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1274 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1275 x7 = x[6+jdx]; 1276 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1277 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1278 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1279 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1280 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1281 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1282 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1283 v += 49; 1284 } 1285 x[idx] = s1; 1286 x[1+idx] = s2; 1287 x[2+idx] = s3; 1288 x[3+idx] = s4; 1289 x[4+idx] = s5; 1290 x[5+idx] = s6; 1291 x[6+idx] = s7; 1292 } 1293 /* backward solve the upper triangular */ 1294 for (i=n-1; i>=0; i--){ 1295 v = aa + 49*diag[i] + 49; 1296 vi = aj + diag[i] + 1; 1297 nz = ai[i+1] - diag[i] - 1; 1298 idt = 7*i; 1299 s1 = x[idt]; s2 = x[1+idt]; 1300 s3 = x[2+idt]; s4 = x[3+idt]; 1301 s5 = x[4+idt]; s6 = x[5+idt]; 1302 s7 = x[6+idt]; 1303 while (nz--) { 1304 idx = 7*(*vi++); 1305 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1306 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1307 x7 = x[6+idx]; 1308 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1309 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1310 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1311 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1312 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1313 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1314 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1315 v += 49; 1316 } 1317 v = aa + 49*diag[i]; 1318 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1319 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1320 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1321 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1322 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1323 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1324 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1325 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1326 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1327 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1328 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1329 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1330 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1331 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1332 } 1333 1334 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1336 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1337 PetscFunctionReturn(0); 1338 } 1339 1340 #undef __FUNCT__ 1341 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1342 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1343 { 1344 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1345 IS iscol=a->col,isrow=a->row; 1346 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1347 int *diag = a->diag; 1348 MatScalar *aa=a->a,*v; 1349 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1350 1351 PetscFunctionBegin; 1352 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1354 t = a->solve_work; 1355 1356 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1357 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1358 1359 /* forward solve the lower triangular */ 1360 idx = 6*(*r++); 1361 t[0] = b[idx]; t[1] = b[1+idx]; 1362 t[2] = b[2+idx]; t[3] = b[3+idx]; 1363 t[4] = b[4+idx]; t[5] = b[5+idx]; 1364 for (i=1; i<n; i++) { 1365 v = aa + 36*ai[i]; 1366 vi = aj + ai[i]; 1367 nz = diag[i] - ai[i]; 1368 idx = 6*(*r++); 1369 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1370 s5 = b[4+idx]; s6 = b[5+idx]; 1371 while (nz--) { 1372 idx = 6*(*vi++); 1373 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1374 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1375 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1376 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1377 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1378 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1379 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1380 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1381 v += 36; 1382 } 1383 idx = 6*i; 1384 t[idx] = s1;t[1+idx] = s2; 1385 t[2+idx] = s3;t[3+idx] = s4; 1386 t[4+idx] = s5;t[5+idx] = s6; 1387 } 1388 /* backward solve the upper triangular */ 1389 for (i=n-1; i>=0; i--){ 1390 v = aa + 36*diag[i] + 36; 1391 vi = aj + diag[i] + 1; 1392 nz = ai[i+1] - diag[i] - 1; 1393 idt = 6*i; 1394 s1 = t[idt]; s2 = t[1+idt]; 1395 s3 = t[2+idt];s4 = t[3+idt]; 1396 s5 = t[4+idt];s6 = t[5+idt]; 1397 while (nz--) { 1398 idx = 6*(*vi++); 1399 x1 = t[idx]; x2 = t[1+idx]; 1400 x3 = t[2+idx]; x4 = t[3+idx]; 1401 x5 = t[4+idx]; x6 = t[5+idx]; 1402 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1403 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1404 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1405 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1406 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1407 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1408 v += 36; 1409 } 1410 idc = 6*(*c--); 1411 v = aa + 36*diag[i]; 1412 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1413 v[18]*s4+v[24]*s5+v[30]*s6; 1414 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1415 v[19]*s4+v[25]*s5+v[31]*s6; 1416 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1417 v[20]*s4+v[26]*s5+v[32]*s6; 1418 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1419 v[21]*s4+v[27]*s5+v[33]*s6; 1420 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1421 v[22]*s4+v[28]*s5+v[34]*s6; 1422 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1423 v[23]*s4+v[29]*s5+v[35]*s6; 1424 } 1425 1426 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1427 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1428 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1429 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1430 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1431 PetscFunctionReturn(0); 1432 } 1433 1434 #undef __FUNCT__ 1435 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1436 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1437 { 1438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1439 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1440 int ierr,*diag = a->diag,jdx; 1441 MatScalar *aa=a->a,*v; 1442 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1443 1444 PetscFunctionBegin; 1445 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1446 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1447 /* forward solve the lower triangular */ 1448 idx = 0; 1449 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1450 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1451 for (i=1; i<n; i++) { 1452 v = aa + 36*ai[i]; 1453 vi = aj + ai[i]; 1454 nz = diag[i] - ai[i]; 1455 idx = 6*i; 1456 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1457 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1458 while (nz--) { 1459 jdx = 6*(*vi++); 1460 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1461 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1462 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1463 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1464 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1465 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1466 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1467 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1468 v += 36; 1469 } 1470 x[idx] = s1; 1471 x[1+idx] = s2; 1472 x[2+idx] = s3; 1473 x[3+idx] = s4; 1474 x[4+idx] = s5; 1475 x[5+idx] = s6; 1476 } 1477 /* backward solve the upper triangular */ 1478 for (i=n-1; i>=0; i--){ 1479 v = aa + 36*diag[i] + 36; 1480 vi = aj + diag[i] + 1; 1481 nz = ai[i+1] - diag[i] - 1; 1482 idt = 6*i; 1483 s1 = x[idt]; s2 = x[1+idt]; 1484 s3 = x[2+idt]; s4 = x[3+idt]; 1485 s5 = x[4+idt]; s6 = x[5+idt]; 1486 while (nz--) { 1487 idx = 6*(*vi++); 1488 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1489 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1490 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1496 v += 36; 1497 } 1498 v = aa + 36*diag[i]; 1499 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1500 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1501 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1502 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1503 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1504 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1505 } 1506 1507 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1508 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1509 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1510 PetscFunctionReturn(0); 1511 } 1512 1513 #undef __FUNCT__ 1514 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1515 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1516 { 1517 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1518 IS iscol=a->col,isrow=a->row; 1519 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1520 int *diag = a->diag; 1521 MatScalar *aa=a->a,*v; 1522 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1523 1524 PetscFunctionBegin; 1525 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1526 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1527 t = a->solve_work; 1528 1529 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1530 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1531 1532 /* forward solve the lower triangular */ 1533 idx = 5*(*r++); 1534 t[0] = b[idx]; t[1] = b[1+idx]; 1535 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1536 for (i=1; i<n; i++) { 1537 v = aa + 25*ai[i]; 1538 vi = aj + ai[i]; 1539 nz = diag[i] - ai[i]; 1540 idx = 5*(*r++); 1541 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1542 s5 = b[4+idx]; 1543 while (nz--) { 1544 idx = 5*(*vi++); 1545 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1546 x4 = t[3+idx];x5 = t[4+idx]; 1547 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1548 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1549 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1550 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1551 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1552 v += 25; 1553 } 1554 idx = 5*i; 1555 t[idx] = s1;t[1+idx] = s2; 1556 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1557 } 1558 /* backward solve the upper triangular */ 1559 for (i=n-1; i>=0; i--){ 1560 v = aa + 25*diag[i] + 25; 1561 vi = aj + diag[i] + 1; 1562 nz = ai[i+1] - diag[i] - 1; 1563 idt = 5*i; 1564 s1 = t[idt]; s2 = t[1+idt]; 1565 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1566 while (nz--) { 1567 idx = 5*(*vi++); 1568 x1 = t[idx]; x2 = t[1+idx]; 1569 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1570 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1571 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1572 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1573 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1574 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1575 v += 25; 1576 } 1577 idc = 5*(*c--); 1578 v = aa + 25*diag[i]; 1579 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1580 v[15]*s4+v[20]*s5; 1581 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1582 v[16]*s4+v[21]*s5; 1583 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1584 v[17]*s4+v[22]*s5; 1585 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1586 v[18]*s4+v[23]*s5; 1587 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1588 v[19]*s4+v[24]*s5; 1589 } 1590 1591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1595 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 #undef __FUNCT__ 1600 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1601 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1602 { 1603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1604 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1605 int ierr,*diag = a->diag,jdx; 1606 MatScalar *aa=a->a,*v; 1607 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1608 1609 PetscFunctionBegin; 1610 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1611 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1612 /* forward solve the lower triangular */ 1613 idx = 0; 1614 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1615 for (i=1; i<n; i++) { 1616 v = aa + 25*ai[i]; 1617 vi = aj + ai[i]; 1618 nz = diag[i] - ai[i]; 1619 idx = 5*i; 1620 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1621 while (nz--) { 1622 jdx = 5*(*vi++); 1623 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1624 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1625 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1626 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1627 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1628 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1629 v += 25; 1630 } 1631 x[idx] = s1; 1632 x[1+idx] = s2; 1633 x[2+idx] = s3; 1634 x[3+idx] = s4; 1635 x[4+idx] = s5; 1636 } 1637 /* backward solve the upper triangular */ 1638 for (i=n-1; i>=0; i--){ 1639 v = aa + 25*diag[i] + 25; 1640 vi = aj + diag[i] + 1; 1641 nz = ai[i+1] - diag[i] - 1; 1642 idt = 5*i; 1643 s1 = x[idt]; s2 = x[1+idt]; 1644 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1645 while (nz--) { 1646 idx = 5*(*vi++); 1647 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1648 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1649 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1650 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1651 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1652 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1653 v += 25; 1654 } 1655 v = aa + 25*diag[i]; 1656 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1657 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1658 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1659 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1660 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1661 } 1662 1663 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1664 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1665 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1666 PetscFunctionReturn(0); 1667 } 1668 1669 #undef __FUNCT__ 1670 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1671 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1672 { 1673 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1674 IS iscol=a->col,isrow=a->row; 1675 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1676 int *diag = a->diag; 1677 MatScalar *aa=a->a,*v; 1678 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1679 1680 PetscFunctionBegin; 1681 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1682 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1683 t = a->solve_work; 1684 1685 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1686 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1687 1688 /* forward solve the lower triangular */ 1689 idx = 4*(*r++); 1690 t[0] = b[idx]; t[1] = b[1+idx]; 1691 t[2] = b[2+idx]; t[3] = b[3+idx]; 1692 for (i=1; i<n; i++) { 1693 v = aa + 16*ai[i]; 1694 vi = aj + ai[i]; 1695 nz = diag[i] - ai[i]; 1696 idx = 4*(*r++); 1697 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1698 while (nz--) { 1699 idx = 4*(*vi++); 1700 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1701 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1702 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1703 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1704 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1705 v += 16; 1706 } 1707 idx = 4*i; 1708 t[idx] = s1;t[1+idx] = s2; 1709 t[2+idx] = s3;t[3+idx] = s4; 1710 } 1711 /* backward solve the upper triangular */ 1712 for (i=n-1; i>=0; i--){ 1713 v = aa + 16*diag[i] + 16; 1714 vi = aj + diag[i] + 1; 1715 nz = ai[i+1] - diag[i] - 1; 1716 idt = 4*i; 1717 s1 = t[idt]; s2 = t[1+idt]; 1718 s3 = t[2+idt];s4 = t[3+idt]; 1719 while (nz--) { 1720 idx = 4*(*vi++); 1721 x1 = t[idx]; x2 = t[1+idx]; 1722 x3 = t[2+idx]; x4 = t[3+idx]; 1723 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1724 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1725 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1726 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1727 v += 16; 1728 } 1729 idc = 4*(*c--); 1730 v = aa + 16*diag[i]; 1731 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1732 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1733 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1734 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1735 } 1736 1737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1739 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1742 PetscFunctionReturn(0); 1743 } 1744 #if defined (PETSC_HAVE_SSE) 1745 1746 #include PETSC_HAVE_SSE 1747 1748 #undef __FUNCT__ 1749 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1750 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1751 { 1752 /* 1753 Note: This code uses demotion of double 1754 to float when performing the mixed-mode computation. 1755 This may not be numerically reasonable for all applications. 1756 */ 1757 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1758 IS iscol=a->col,isrow=a->row; 1759 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1760 int *diag = a->diag,ai16; 1761 MatScalar *aa=a->a,*v; 1762 PetscScalar *x,*b,*t; 1763 1764 /* Make space in temp stack for 16 Byte Aligned arrays */ 1765 float ssealignedspace[11],*tmps,*tmpx; 1766 unsigned long offset; 1767 1768 PetscFunctionBegin; 1769 SSE_SCOPE_BEGIN; 1770 1771 offset = (unsigned long)ssealignedspace % 16; 1772 if (offset) offset = (16 - offset)/4; 1773 tmps = &ssealignedspace[offset]; 1774 tmpx = &ssealignedspace[offset+4]; 1775 PREFETCH_NTA(aa+16*ai[1]); 1776 1777 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1778 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1779 t = a->solve_work; 1780 1781 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1782 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1783 1784 /* forward solve the lower triangular */ 1785 idx = 4*(*r++); 1786 t[0] = b[idx]; t[1] = b[1+idx]; 1787 t[2] = b[2+idx]; t[3] = b[3+idx]; 1788 v = aa + 16*ai[1]; 1789 1790 for (i=1; i<n;) { 1791 PREFETCH_NTA(&v[8]); 1792 vi = aj + ai[i]; 1793 nz = diag[i] - ai[i]; 1794 idx = 4*(*r++); 1795 1796 /* Demote sum from double to float */ 1797 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1798 LOAD_PS(tmps,XMM7); 1799 1800 while (nz--) { 1801 PREFETCH_NTA(&v[16]); 1802 idx = 4*(*vi++); 1803 1804 /* Demote solution (so far) from double to float */ 1805 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1806 1807 /* 4x4 Matrix-Vector product with negative accumulation: */ 1808 SSE_INLINE_BEGIN_2(tmpx,v) 1809 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1810 1811 /* First Column */ 1812 SSE_COPY_PS(XMM0,XMM6) 1813 SSE_SHUFFLE(XMM0,XMM0,0x00) 1814 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1815 SSE_SUB_PS(XMM7,XMM0) 1816 1817 /* Second Column */ 1818 SSE_COPY_PS(XMM1,XMM6) 1819 SSE_SHUFFLE(XMM1,XMM1,0x55) 1820 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1821 SSE_SUB_PS(XMM7,XMM1) 1822 1823 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1824 1825 /* Third Column */ 1826 SSE_COPY_PS(XMM2,XMM6) 1827 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1828 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1829 SSE_SUB_PS(XMM7,XMM2) 1830 1831 /* Fourth Column */ 1832 SSE_COPY_PS(XMM3,XMM6) 1833 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1834 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1835 SSE_SUB_PS(XMM7,XMM3) 1836 SSE_INLINE_END_2 1837 1838 v += 16; 1839 } 1840 idx = 4*i; 1841 v = aa + 16*ai[++i]; 1842 PREFETCH_NTA(v); 1843 STORE_PS(tmps,XMM7); 1844 1845 /* Promote result from float to double */ 1846 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1847 } 1848 /* backward solve the upper triangular */ 1849 idt = 4*(n-1); 1850 ai16 = 16*diag[n-1]; 1851 v = aa + ai16 + 16; 1852 for (i=n-1; i>=0;){ 1853 PREFETCH_NTA(&v[8]); 1854 vi = aj + diag[i] + 1; 1855 nz = ai[i+1] - diag[i] - 1; 1856 1857 /* Demote accumulator from double to float */ 1858 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1859 LOAD_PS(tmps,XMM7); 1860 1861 while (nz--) { 1862 PREFETCH_NTA(&v[16]); 1863 idx = 4*(*vi++); 1864 1865 /* Demote solution (so far) from double to float */ 1866 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1867 1868 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1869 SSE_INLINE_BEGIN_2(tmpx,v) 1870 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1871 1872 /* First Column */ 1873 SSE_COPY_PS(XMM0,XMM6) 1874 SSE_SHUFFLE(XMM0,XMM0,0x00) 1875 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1876 SSE_SUB_PS(XMM7,XMM0) 1877 1878 /* Second Column */ 1879 SSE_COPY_PS(XMM1,XMM6) 1880 SSE_SHUFFLE(XMM1,XMM1,0x55) 1881 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1882 SSE_SUB_PS(XMM7,XMM1) 1883 1884 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1885 1886 /* Third Column */ 1887 SSE_COPY_PS(XMM2,XMM6) 1888 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1889 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1890 SSE_SUB_PS(XMM7,XMM2) 1891 1892 /* Fourth Column */ 1893 SSE_COPY_PS(XMM3,XMM6) 1894 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1895 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1896 SSE_SUB_PS(XMM7,XMM3) 1897 SSE_INLINE_END_2 1898 v += 16; 1899 } 1900 v = aa + ai16; 1901 ai16 = 16*diag[--i]; 1902 PREFETCH_NTA(aa+ai16+16); 1903 /* 1904 Scale the result by the diagonal 4x4 block, 1905 which was inverted as part of the factorization 1906 */ 1907 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 1908 /* First Column */ 1909 SSE_COPY_PS(XMM0,XMM7) 1910 SSE_SHUFFLE(XMM0,XMM0,0x00) 1911 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 1912 1913 /* Second Column */ 1914 SSE_COPY_PS(XMM1,XMM7) 1915 SSE_SHUFFLE(XMM1,XMM1,0x55) 1916 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 1917 SSE_ADD_PS(XMM0,XMM1) 1918 1919 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 1920 1921 /* Third Column */ 1922 SSE_COPY_PS(XMM2,XMM7) 1923 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1924 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 1925 SSE_ADD_PS(XMM0,XMM2) 1926 1927 /* Fourth Column */ 1928 SSE_COPY_PS(XMM3,XMM7) 1929 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1930 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 1931 SSE_ADD_PS(XMM0,XMM3) 1932 1933 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 1934 SSE_INLINE_END_3 1935 1936 /* Promote solution from float to double */ 1937 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 1938 1939 /* Apply reordering to t and stream into x. */ 1940 /* This way, x doesn't pollute the cache. */ 1941 /* Be careful with size: 2 doubles = 4 floats! */ 1942 idc = 4*(*c--); 1943 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 1944 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 1945 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 1946 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 1947 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 1948 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 1949 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 1950 SSE_INLINE_END_2 1951 v = aa + ai16 + 16; 1952 idt -= 4; 1953 } 1954 1955 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1956 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1957 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1958 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1959 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1960 SSE_SCOPE_END; 1961 PetscFunctionReturn(0); 1962 } 1963 1964 #endif 1965 1966 1967 /* 1968 Special case where the matrix was ILU(0) factored in the natural 1969 ordering. This eliminates the need for the column and row permutation. 1970 */ 1971 #undef __FUNCT__ 1972 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 1973 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 1974 { 1975 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1976 int n=a->mbs,*ai=a->i,*aj=a->j; 1977 int ierr,*diag = a->diag; 1978 MatScalar *aa=a->a; 1979 PetscScalar *x,*b; 1980 1981 PetscFunctionBegin; 1982 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1983 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1984 1985 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 1986 { 1987 static PetscScalar w[2000]; /* very BAD need to fix */ 1988 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 1989 } 1990 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 1991 { 1992 static PetscScalar w[2000]; /* very BAD need to fix */ 1993 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 1994 } 1995 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 1996 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 1997 #else 1998 { 1999 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2000 MatScalar *v; 2001 int jdx,idt,idx,nz,*vi,i,ai16; 2002 2003 /* forward solve the lower triangular */ 2004 idx = 0; 2005 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2006 for (i=1; i<n; i++) { 2007 v = aa + 16*ai[i]; 2008 vi = aj + ai[i]; 2009 nz = diag[i] - ai[i]; 2010 idx += 4; 2011 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2012 while (nz--) { 2013 jdx = 4*(*vi++); 2014 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2015 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2016 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2017 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2018 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2019 v += 16; 2020 } 2021 x[idx] = s1; 2022 x[1+idx] = s2; 2023 x[2+idx] = s3; 2024 x[3+idx] = s4; 2025 } 2026 /* backward solve the upper triangular */ 2027 idt = 4*(n-1); 2028 for (i=n-1; i>=0; i--){ 2029 ai16 = 16*diag[i]; 2030 v = aa + ai16 + 16; 2031 vi = aj + diag[i] + 1; 2032 nz = ai[i+1] - diag[i] - 1; 2033 s1 = x[idt]; s2 = x[1+idt]; 2034 s3 = x[2+idt];s4 = x[3+idt]; 2035 while (nz--) { 2036 idx = 4*(*vi++); 2037 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2038 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2039 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2040 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2041 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2042 v += 16; 2043 } 2044 v = aa + ai16; 2045 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2046 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2047 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2048 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2049 idt -= 4; 2050 } 2051 } 2052 #endif 2053 2054 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2055 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2056 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2057 PetscFunctionReturn(0); 2058 } 2059 2060 #if defined (PETSC_HAVE_SSE) 2061 2062 #include PETSC_HAVE_SSE 2063 2064 #undef __FUNCT__ 2065 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2066 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2067 { 2068 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2069 int n=a->mbs,*ai=a->i,*aj=a->j; 2070 int ierr,*diag = a->diag; 2071 MatScalar *aa=a->a; 2072 PetscScalar *x,*b; 2073 2074 PetscFunctionBegin; 2075 SSE_SCOPE_BEGIN; 2076 /* 2077 Note: This code currently uses demotion of double 2078 to float when performing the mixed-mode computation. 2079 This may not be numerically reasonable for all applications. 2080 */ 2081 PREFETCH_NTA(aa+16*ai[1]); 2082 2083 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2084 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2085 { 2086 MatScalar *v; 2087 int jdx,idt,idx,nz,*vi,i,ai16; 2088 2089 /* Make space in temp stack for 16 Byte Aligned arrays */ 2090 float ssealignedspace[11],*tmps,*tmpx; 2091 unsigned long offset = (unsigned long)ssealignedspace % 16; 2092 if (offset) offset = (16 - offset)/4; 2093 tmps = &ssealignedspace[offset]; 2094 tmpx = &ssealignedspace[offset+4]; 2095 2096 /* forward solve the lower triangular */ 2097 idx = 0; 2098 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2099 v = aa + 16*ai[1]; 2100 2101 for (i=1; i<n;) { 2102 PREFETCH_NTA(&v[8]); 2103 vi = aj + ai[i]; 2104 nz = diag[i] - ai[i]; 2105 idx += 4; 2106 2107 /* Demote sum from double to float */ 2108 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 2109 LOAD_PS(tmps,XMM7); 2110 2111 while (nz--) { 2112 PREFETCH_NTA(&v[16]); 2113 jdx = 4*(*vi++); 2114 2115 /* Demote solution (so far) from double to float */ 2116 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]); 2117 2118 /* 4x4 Matrix-Vector product with negative accumulation: */ 2119 SSE_INLINE_BEGIN_2(tmpx,v) 2120 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2121 2122 /* First Column */ 2123 SSE_COPY_PS(XMM0,XMM6) 2124 SSE_SHUFFLE(XMM0,XMM0,0x00) 2125 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2126 SSE_SUB_PS(XMM7,XMM0) 2127 2128 /* Second Column */ 2129 SSE_COPY_PS(XMM1,XMM6) 2130 SSE_SHUFFLE(XMM1,XMM1,0x55) 2131 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2132 SSE_SUB_PS(XMM7,XMM1) 2133 2134 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2135 2136 /* Third Column */ 2137 SSE_COPY_PS(XMM2,XMM6) 2138 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2139 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2140 SSE_SUB_PS(XMM7,XMM2) 2141 2142 /* Fourth Column */ 2143 SSE_COPY_PS(XMM3,XMM6) 2144 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2145 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2146 SSE_SUB_PS(XMM7,XMM3) 2147 SSE_INLINE_END_2 2148 2149 v += 16; 2150 } 2151 v = aa + 16*ai[++i]; 2152 PREFETCH_NTA(v); 2153 STORE_PS(tmps,XMM7); 2154 2155 /* Promote result from float to double */ 2156 CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps); 2157 } 2158 /* backward solve the upper triangular */ 2159 idt = 4*(n-1); 2160 ai16 = 16*diag[n-1]; 2161 v = aa + ai16 + 16; 2162 for (i=n-1; i>=0;){ 2163 PREFETCH_NTA(&v[8]); 2164 vi = aj + diag[i] + 1; 2165 nz = ai[i+1] - diag[i] - 1; 2166 2167 /* Demote accumulator from double to float */ 2168 CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]); 2169 LOAD_PS(tmps,XMM7); 2170 2171 while (nz--) { 2172 PREFETCH_NTA(&v[16]); 2173 idx = 4*(*vi++); 2174 2175 /* Demote solution (so far) from double to float */ 2176 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 2177 2178 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2179 SSE_INLINE_BEGIN_2(tmpx,v) 2180 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2181 2182 /* First Column */ 2183 SSE_COPY_PS(XMM0,XMM6) 2184 SSE_SHUFFLE(XMM0,XMM0,0x00) 2185 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2186 SSE_SUB_PS(XMM7,XMM0) 2187 2188 /* Second Column */ 2189 SSE_COPY_PS(XMM1,XMM6) 2190 SSE_SHUFFLE(XMM1,XMM1,0x55) 2191 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2192 SSE_SUB_PS(XMM7,XMM1) 2193 2194 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2195 2196 /* Third Column */ 2197 SSE_COPY_PS(XMM2,XMM6) 2198 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2199 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2200 SSE_SUB_PS(XMM7,XMM2) 2201 2202 /* Fourth Column */ 2203 SSE_COPY_PS(XMM3,XMM6) 2204 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2205 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2206 SSE_SUB_PS(XMM7,XMM3) 2207 SSE_INLINE_END_2 2208 v += 16; 2209 } 2210 v = aa + ai16; 2211 ai16 = 16*diag[--i]; 2212 PREFETCH_NTA(aa+ai16+16); 2213 /* 2214 Scale the result by the diagonal 4x4 block, 2215 which was inverted as part of the factorization 2216 */ 2217 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2218 /* First Column */ 2219 SSE_COPY_PS(XMM0,XMM7) 2220 SSE_SHUFFLE(XMM0,XMM0,0x00) 2221 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2222 2223 /* Second Column */ 2224 SSE_COPY_PS(XMM1,XMM7) 2225 SSE_SHUFFLE(XMM1,XMM1,0x55) 2226 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2227 SSE_ADD_PS(XMM0,XMM1) 2228 2229 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2230 2231 /* Third Column */ 2232 SSE_COPY_PS(XMM2,XMM7) 2233 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2234 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2235 SSE_ADD_PS(XMM0,XMM2) 2236 2237 /* Fourth Column */ 2238 SSE_COPY_PS(XMM3,XMM7) 2239 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2240 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2241 SSE_ADD_PS(XMM0,XMM3) 2242 2243 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2244 SSE_INLINE_END_3 2245 2246 /* Promote solution from float to double */ 2247 CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps); 2248 2249 v = aa + ai16 + 16; 2250 idt -= 4; 2251 } 2252 } 2253 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2254 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2255 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2256 SSE_SCOPE_END; 2257 PetscFunctionReturn(0); 2258 } 2259 2260 #endif 2261 #undef __FUNCT__ 2262 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2263 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2264 { 2265 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2266 IS iscol=a->col,isrow=a->row; 2267 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2268 int *diag = a->diag; 2269 MatScalar *aa=a->a,*v; 2270 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2271 2272 PetscFunctionBegin; 2273 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2274 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2275 t = a->solve_work; 2276 2277 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2278 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2279 2280 /* forward solve the lower triangular */ 2281 idx = 3*(*r++); 2282 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2283 for (i=1; i<n; i++) { 2284 v = aa + 9*ai[i]; 2285 vi = aj + ai[i]; 2286 nz = diag[i] - ai[i]; 2287 idx = 3*(*r++); 2288 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2289 while (nz--) { 2290 idx = 3*(*vi++); 2291 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2292 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2293 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2294 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2295 v += 9; 2296 } 2297 idx = 3*i; 2298 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2299 } 2300 /* backward solve the upper triangular */ 2301 for (i=n-1; i>=0; i--){ 2302 v = aa + 9*diag[i] + 9; 2303 vi = aj + diag[i] + 1; 2304 nz = ai[i+1] - diag[i] - 1; 2305 idt = 3*i; 2306 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2307 while (nz--) { 2308 idx = 3*(*vi++); 2309 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2310 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2311 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2312 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2313 v += 9; 2314 } 2315 idc = 3*(*c--); 2316 v = aa + 9*diag[i]; 2317 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2318 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2319 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2320 } 2321 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2322 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2323 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2324 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2325 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2326 PetscFunctionReturn(0); 2327 } 2328 2329 /* 2330 Special case where the matrix was ILU(0) factored in the natural 2331 ordering. This eliminates the need for the column and row permutation. 2332 */ 2333 #undef __FUNCT__ 2334 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2335 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2336 { 2337 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2338 int n=a->mbs,*ai=a->i,*aj=a->j; 2339 int ierr,*diag = a->diag; 2340 MatScalar *aa=a->a,*v; 2341 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2342 int jdx,idt,idx,nz,*vi,i; 2343 2344 PetscFunctionBegin; 2345 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2346 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2347 2348 2349 /* forward solve the lower triangular */ 2350 idx = 0; 2351 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2352 for (i=1; i<n; i++) { 2353 v = aa + 9*ai[i]; 2354 vi = aj + ai[i]; 2355 nz = diag[i] - ai[i]; 2356 idx += 3; 2357 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2358 while (nz--) { 2359 jdx = 3*(*vi++); 2360 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2361 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2362 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2363 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2364 v += 9; 2365 } 2366 x[idx] = s1; 2367 x[1+idx] = s2; 2368 x[2+idx] = s3; 2369 } 2370 /* backward solve the upper triangular */ 2371 for (i=n-1; i>=0; i--){ 2372 v = aa + 9*diag[i] + 9; 2373 vi = aj + diag[i] + 1; 2374 nz = ai[i+1] - diag[i] - 1; 2375 idt = 3*i; 2376 s1 = x[idt]; s2 = x[1+idt]; 2377 s3 = x[2+idt]; 2378 while (nz--) { 2379 idx = 3*(*vi++); 2380 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2381 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2382 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2383 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2384 v += 9; 2385 } 2386 v = aa + 9*diag[i]; 2387 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2388 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2389 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2390 } 2391 2392 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2393 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2394 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2395 PetscFunctionReturn(0); 2396 } 2397 2398 #undef __FUNCT__ 2399 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2400 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2401 { 2402 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2403 IS iscol=a->col,isrow=a->row; 2404 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2405 int *diag = a->diag; 2406 MatScalar *aa=a->a,*v; 2407 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2408 2409 PetscFunctionBegin; 2410 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2411 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2412 t = a->solve_work; 2413 2414 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2415 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2416 2417 /* forward solve the lower triangular */ 2418 idx = 2*(*r++); 2419 t[0] = b[idx]; t[1] = b[1+idx]; 2420 for (i=1; i<n; i++) { 2421 v = aa + 4*ai[i]; 2422 vi = aj + ai[i]; 2423 nz = diag[i] - ai[i]; 2424 idx = 2*(*r++); 2425 s1 = b[idx]; s2 = b[1+idx]; 2426 while (nz--) { 2427 idx = 2*(*vi++); 2428 x1 = t[idx]; x2 = t[1+idx]; 2429 s1 -= v[0]*x1 + v[2]*x2; 2430 s2 -= v[1]*x1 + v[3]*x2; 2431 v += 4; 2432 } 2433 idx = 2*i; 2434 t[idx] = s1; t[1+idx] = s2; 2435 } 2436 /* backward solve the upper triangular */ 2437 for (i=n-1; i>=0; i--){ 2438 v = aa + 4*diag[i] + 4; 2439 vi = aj + diag[i] + 1; 2440 nz = ai[i+1] - diag[i] - 1; 2441 idt = 2*i; 2442 s1 = t[idt]; s2 = t[1+idt]; 2443 while (nz--) { 2444 idx = 2*(*vi++); 2445 x1 = t[idx]; x2 = t[1+idx]; 2446 s1 -= v[0]*x1 + v[2]*x2; 2447 s2 -= v[1]*x1 + v[3]*x2; 2448 v += 4; 2449 } 2450 idc = 2*(*c--); 2451 v = aa + 4*diag[i]; 2452 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2453 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2454 } 2455 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2456 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2457 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2458 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2459 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2460 PetscFunctionReturn(0); 2461 } 2462 2463 /* 2464 Special case where the matrix was ILU(0) factored in the natural 2465 ordering. This eliminates the need for the column and row permutation. 2466 */ 2467 #undef __FUNCT__ 2468 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2469 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2470 { 2471 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2472 int n=a->mbs,*ai=a->i,*aj=a->j; 2473 int ierr,*diag = a->diag; 2474 MatScalar *aa=a->a,*v; 2475 PetscScalar *x,*b,s1,s2,x1,x2; 2476 int jdx,idt,idx,nz,*vi,i; 2477 2478 PetscFunctionBegin; 2479 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2480 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2481 2482 /* forward solve the lower triangular */ 2483 idx = 0; 2484 x[0] = b[0]; x[1] = b[1]; 2485 for (i=1; i<n; i++) { 2486 v = aa + 4*ai[i]; 2487 vi = aj + ai[i]; 2488 nz = diag[i] - ai[i]; 2489 idx += 2; 2490 s1 = b[idx];s2 = b[1+idx]; 2491 while (nz--) { 2492 jdx = 2*(*vi++); 2493 x1 = x[jdx];x2 = x[1+jdx]; 2494 s1 -= v[0]*x1 + v[2]*x2; 2495 s2 -= v[1]*x1 + v[3]*x2; 2496 v += 4; 2497 } 2498 x[idx] = s1; 2499 x[1+idx] = s2; 2500 } 2501 /* backward solve the upper triangular */ 2502 for (i=n-1; i>=0; i--){ 2503 v = aa + 4*diag[i] + 4; 2504 vi = aj + diag[i] + 1; 2505 nz = ai[i+1] - diag[i] - 1; 2506 idt = 2*i; 2507 s1 = x[idt]; s2 = x[1+idt]; 2508 while (nz--) { 2509 idx = 2*(*vi++); 2510 x1 = x[idx]; x2 = x[1+idx]; 2511 s1 -= v[0]*x1 + v[2]*x2; 2512 s2 -= v[1]*x1 + v[3]*x2; 2513 v += 4; 2514 } 2515 v = aa + 4*diag[i]; 2516 x[idt] = v[0]*s1 + v[2]*s2; 2517 x[1+idt] = v[1]*s1 + v[3]*s2; 2518 } 2519 2520 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2521 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2522 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2523 PetscFunctionReturn(0); 2524 } 2525 2526 #undef __FUNCT__ 2527 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2528 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2529 { 2530 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2531 IS iscol=a->col,isrow=a->row; 2532 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2533 int *diag = a->diag; 2534 MatScalar *aa=a->a,*v; 2535 PetscScalar *x,*b,s1,*t; 2536 2537 PetscFunctionBegin; 2538 if (!n) PetscFunctionReturn(0); 2539 2540 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2541 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2542 t = a->solve_work; 2543 2544 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2545 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2546 2547 /* forward solve the lower triangular */ 2548 t[0] = b[*r++]; 2549 for (i=1; i<n; i++) { 2550 v = aa + ai[i]; 2551 vi = aj + ai[i]; 2552 nz = diag[i] - ai[i]; 2553 s1 = b[*r++]; 2554 while (nz--) { 2555 s1 -= (*v++)*t[*vi++]; 2556 } 2557 t[i] = s1; 2558 } 2559 /* backward solve the upper triangular */ 2560 for (i=n-1; i>=0; i--){ 2561 v = aa + diag[i] + 1; 2562 vi = aj + diag[i] + 1; 2563 nz = ai[i+1] - diag[i] - 1; 2564 s1 = t[i]; 2565 while (nz--) { 2566 s1 -= (*v++)*t[*vi++]; 2567 } 2568 x[*c--] = t[i] = aa[diag[i]]*s1; 2569 } 2570 2571 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2572 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2573 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2574 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2575 PetscLogFlops(2*1*(a->nz) - A->n); 2576 PetscFunctionReturn(0); 2577 } 2578 /* 2579 Special case where the matrix was ILU(0) factored in the natural 2580 ordering. This eliminates the need for the column and row permutation. 2581 */ 2582 #undef __FUNCT__ 2583 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2584 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2585 { 2586 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2587 int n=a->mbs,*ai=a->i,*aj=a->j; 2588 int ierr,*diag = a->diag; 2589 MatScalar *aa=a->a; 2590 PetscScalar *x,*b; 2591 PetscScalar s1,x1; 2592 MatScalar *v; 2593 int jdx,idt,idx,nz,*vi,i; 2594 2595 PetscFunctionBegin; 2596 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2597 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2598 2599 /* forward solve the lower triangular */ 2600 idx = 0; 2601 x[0] = b[0]; 2602 for (i=1; i<n; i++) { 2603 v = aa + ai[i]; 2604 vi = aj + ai[i]; 2605 nz = diag[i] - ai[i]; 2606 idx += 1; 2607 s1 = b[idx]; 2608 while (nz--) { 2609 jdx = *vi++; 2610 x1 = x[jdx]; 2611 s1 -= v[0]*x1; 2612 v += 1; 2613 } 2614 x[idx] = s1; 2615 } 2616 /* backward solve the upper triangular */ 2617 for (i=n-1; i>=0; i--){ 2618 v = aa + diag[i] + 1; 2619 vi = aj + diag[i] + 1; 2620 nz = ai[i+1] - diag[i] - 1; 2621 idt = i; 2622 s1 = x[idt]; 2623 while (nz--) { 2624 idx = *vi++; 2625 x1 = x[idx]; 2626 s1 -= v[0]*x1; 2627 v += 1; 2628 } 2629 v = aa + diag[i]; 2630 x[idt] = v[0]*s1; 2631 } 2632 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2633 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2634 PetscLogFlops(2*(a->nz) - A->n); 2635 PetscFunctionReturn(0); 2636 } 2637 2638 /* ----------------------------------------------------------------*/ 2639 /* 2640 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 2641 except that the data structure of Mat_SeqAIJ is slightly different. 2642 Not a good example of code reuse. 2643 */ 2644 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 2645 2646 #undef __FUNCT__ 2647 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 2648 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 2649 { 2650 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 2651 IS isicol; 2652 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 2653 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 2654 int *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0; 2655 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 2656 PetscTruth col_identity,row_identity; 2657 PetscReal f; 2658 2659 PetscFunctionBegin; 2660 if (info) { 2661 f = info->fill; 2662 levels = (int)info->levels; 2663 diagonal_fill = (int)info->diagonal_fill; 2664 } else { 2665 f = 1.0; 2666 levels = 0; 2667 diagonal_fill = 0; 2668 } 2669 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2670 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 2671 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 2672 2673 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 2674 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 2675 (*fact)->factor = FACTOR_LU; 2676 b = (Mat_SeqBAIJ*)(*fact)->data; 2677 if (!b->diag) { 2678 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2679 } 2680 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2681 b->row = isrow; 2682 b->col = iscol; 2683 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2684 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2685 b->icol = isicol; 2686 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2687 } else { /* general case perform the symbolic factorization */ 2688 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2689 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2690 2691 /* get new row pointers */ 2692 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 2693 ainew[0] = 0; 2694 /* don't know how many column pointers are needed so estimate */ 2695 jmax = (int)(f*ai[n] + 1); 2696 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 2697 /* ajfill is level of fill for each fill entry */ 2698 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 2699 /* fill is a linked list of nonzeros in active row */ 2700 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 2701 /* im is level for each filled value */ 2702 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 2703 /* dloc is location of diagonal in factor */ 2704 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 2705 dloc[0] = 0; 2706 for (prow=0; prow<n; prow++) { 2707 2708 /* copy prow into linked list */ 2709 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 2710 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 2711 xi = aj + ai[r[prow]]; 2712 fill[n] = n; 2713 fill[prow] = -1; /* marker for diagonal entry */ 2714 while (nz--) { 2715 fm = n; 2716 idx = ic[*xi++]; 2717 do { 2718 m = fm; 2719 fm = fill[m]; 2720 } while (fm < idx); 2721 fill[m] = idx; 2722 fill[idx] = fm; 2723 im[idx] = 0; 2724 } 2725 2726 /* make sure diagonal entry is included */ 2727 if (diagonal_fill && fill[prow] == -1) { 2728 fm = n; 2729 while (fill[fm] < prow) fm = fill[fm]; 2730 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 2731 fill[fm] = prow; 2732 im[prow] = 0; 2733 nzf++; 2734 dcount++; 2735 } 2736 2737 nzi = 0; 2738 row = fill[n]; 2739 while (row < prow) { 2740 incrlev = im[row] + 1; 2741 nz = dloc[row]; 2742 xi = ajnew + ainew[row] + nz + 1; 2743 flev = ajfill + ainew[row] + nz + 1; 2744 nnz = ainew[row+1] - ainew[row] - nz - 1; 2745 fm = row; 2746 while (nnz-- > 0) { 2747 idx = *xi++; 2748 if (*flev + incrlev > levels) { 2749 flev++; 2750 continue; 2751 } 2752 do { 2753 m = fm; 2754 fm = fill[m]; 2755 } while (fm < idx); 2756 if (fm != idx) { 2757 im[idx] = *flev + incrlev; 2758 fill[m] = idx; 2759 fill[idx] = fm; 2760 fm = idx; 2761 nzf++; 2762 } else { 2763 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 2764 } 2765 flev++; 2766 } 2767 row = fill[row]; 2768 nzi++; 2769 } 2770 /* copy new filled row into permanent storage */ 2771 ainew[prow+1] = ainew[prow] + nzf; 2772 if (ainew[prow+1] > jmax) { 2773 2774 /* estimate how much additional space we will need */ 2775 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2776 /* just double the memory each time */ 2777 int maxadd = jmax; 2778 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 2779 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 2780 jmax += maxadd; 2781 2782 /* allocate a longer ajnew and ajfill */ 2783 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2784 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2785 ierr = PetscFree(ajnew);CHKERRQ(ierr); 2786 ajnew = xi; 2787 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2788 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2789 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2790 ajfill = xi; 2791 realloc++; /* count how many reallocations are needed */ 2792 } 2793 xi = ajnew + ainew[prow]; 2794 flev = ajfill + ainew[prow]; 2795 dloc[prow] = nzi; 2796 fm = fill[n]; 2797 while (nzf--) { 2798 *xi++ = fm; 2799 *flev++ = im[fm]; 2800 fm = fill[fm]; 2801 } 2802 /* make sure row has diagonal entry */ 2803 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 2804 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 2805 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 2806 } 2807 } 2808 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2809 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2810 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2811 ierr = PetscFree(fill);CHKERRQ(ierr); 2812 ierr = PetscFree(im);CHKERRQ(ierr); 2813 2814 { 2815 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 2816 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 2817 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 2818 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 2819 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 2820 if (diagonal_fill) { 2821 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount); 2822 } 2823 } 2824 2825 /* put together the new matrix */ 2826 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 2827 PetscLogObjectParent(*fact,isicol); 2828 b = (Mat_SeqBAIJ*)(*fact)->data; 2829 ierr = PetscFree(b->imax);CHKERRQ(ierr); 2830 b->singlemalloc = PETSC_FALSE; 2831 len = bs2*ainew[n]*sizeof(MatScalar); 2832 /* the next line frees the default space generated by the Create() */ 2833 ierr = PetscFree(b->a);CHKERRQ(ierr); 2834 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 2835 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 2836 b->j = ajnew; 2837 b->i = ainew; 2838 for (i=0; i<n; i++) dloc[i] += ainew[i]; 2839 b->diag = dloc; 2840 b->ilen = 0; 2841 b->imax = 0; 2842 b->row = isrow; 2843 b->col = iscol; 2844 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2845 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2846 b->icol = isicol; 2847 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2848 /* In b structure: Free imax, ilen, old a, old j. 2849 Allocate dloc, solve_work, new a, new j */ 2850 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 2851 b->maxnz = b->nz = ainew[n]; 2852 (*fact)->factor = FACTOR_LU; 2853 2854 (*fact)->info.factor_mallocs = realloc; 2855 (*fact)->info.fill_ratio_given = f; 2856 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 2857 } 2858 2859 if (row_identity && col_identity) { 2860 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 2861 } 2862 PetscFunctionReturn(0); 2863 } 2864 2865 #undef __FUNCT__ 2866 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 2867 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 2868 { 2869 /* 2870 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 2871 with natural ordering 2872 */ 2873 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 2874 PetscTruth sse_enabled_local,sse_enabled_global; 2875 int ierr; 2876 2877 PetscFunctionBegin; 2878 inA->ops->solve = MatSolve_SeqBAIJ_Update; 2879 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 2880 switch (a->bs) { 2881 case 1: 2882 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 2883 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 2884 break; 2885 case 2: 2886 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 2887 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 2888 break; 2889 case 3: 2890 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 2891 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 2892 break; 2893 case 4: 2894 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 2895 if (sse_enabled_local) { 2896 #if defined(PETSC_HAVE_SSE) 2897 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 2898 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n"); 2899 #else 2900 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 2901 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 2902 #endif 2903 } else { 2904 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 2905 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 2906 } 2907 break; 2908 case 5: 2909 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 2910 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 2911 break; 2912 case 6: 2913 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 2914 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 2915 break; 2916 case 7: 2917 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 2918 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 2919 break; 2920 } 2921 PetscFunctionReturn(0); 2922 } 2923 2924 #undef __FUNCT__ 2925 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 2926 int MatSeqBAIJ_UpdateSolvers(Mat A) 2927 { 2928 /* 2929 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 2930 with natural ordering 2931 */ 2932 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2933 IS row = a->row, col = a->col; 2934 PetscTruth row_identity, col_identity; 2935 PetscTruth sse_enabled_local, sse_enabled_global; 2936 PetscTruth use_single, use_natural; 2937 int ierr; 2938 2939 PetscFunctionBegin; 2940 2941 use_single = PETSC_FALSE; 2942 use_natural = PETSC_FALSE; 2943 2944 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 2945 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 2946 2947 if (row_identity && col_identity) { 2948 use_natural = PETSC_TRUE; 2949 } else { 2950 use_natural = PETSC_FALSE; 2951 } 2952 switch (a->bs) { 2953 case 1: 2954 if (use_natural) { 2955 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 2956 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 2957 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 2958 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2959 } else { 2960 A->ops->solve = MatSolve_SeqBAIJ_1; 2961 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 2962 } 2963 break; 2964 case 2: 2965 if (use_natural) { 2966 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 2967 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 2968 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 2969 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2970 } else { 2971 A->ops->solve = MatSolve_SeqBAIJ_2; 2972 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 2973 } 2974 break; 2975 case 3: 2976 if (use_natural) { 2977 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 2978 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 2979 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 2980 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 2981 } else { 2982 A->ops->solve = MatSolve_SeqBAIJ_3; 2983 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 2984 } 2985 break; 2986 case 4: 2987 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr); 2988 if (sse_enabled_local) { 2989 PetscTruth single_prec,flg; 2990 single_prec = flg = PETSC_FALSE; 2991 ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr); 2992 if (flg) { 2993 a->single_precision_solves = single_prec; 2994 } 2995 if (a->single_precision_solves) { 2996 use_single = PETSC_TRUE; 2997 } 2998 } else { 2999 use_single = PETSC_FALSE; 3000 } 3001 if (use_natural) { 3002 if (use_single) { 3003 #if defined(PETSC_HAVE_SSE) 3004 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3005 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n"); 3006 #else 3007 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3008 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3009 #endif 3010 } else { 3011 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3012 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3013 } 3014 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3015 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3016 } else { 3017 if (use_single) { 3018 #if defined(PETSC_HAVE_SSE) 3019 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3020 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3021 #else 3022 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3023 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3024 #endif 3025 } else { 3026 A->ops->solve = MatSolve_SeqBAIJ_4; 3027 } 3028 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3029 } 3030 break; 3031 case 5: 3032 if (use_natural) { 3033 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3034 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3035 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3036 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3037 } else { 3038 A->ops->solve = MatSolve_SeqBAIJ_5; 3039 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3040 } 3041 break; 3042 case 6: 3043 if (use_natural) { 3044 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3045 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3046 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3047 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3048 } else { 3049 A->ops->solve = MatSolve_SeqBAIJ_6; 3050 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3051 } 3052 break; 3053 case 7: 3054 if (use_natural) { 3055 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3056 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3057 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3058 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3059 } else { 3060 A->ops->solve = MatSolve_SeqBAIJ_7; 3061 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3062 } 3063 break; 3064 default: 3065 A->ops->solve = MatSolve_SeqBAIJ_N; 3066 break; 3067 } 3068 PetscFunctionReturn(0); 3069 } 3070 3071 #undef __FUNCT__ 3072 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3073 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3074 int ierr; 3075 3076 PetscFunctionBegin; 3077 ierr = MatSeqBAIJ_UpdateSolvers(A); 3078 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3079 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3080 } else { 3081 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3082 } 3083 PetscFunctionReturn(0); 3084 } 3085 3086 #undef __FUNCT__ 3087 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3088 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3089 int ierr; 3090 3091 PetscFunctionBegin; 3092 ierr = MatSeqBAIJ_UpdateSolvers(A); 3093 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3094 PetscFunctionReturn(0); 3095 } 3096 3097 3098