1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/vec/vecimpl.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 int *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*x[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PetscLogFlops(2*(a->nz) - A->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 119 int *diag = a->diag,oidx; 120 MatScalar *aa=a->a,*v; 121 PetscScalar s1,s2,s3,x1,x2,x3; 122 PetscScalar *x,*b; 123 124 PetscFunctionBegin; 125 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 126 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128 129 /* forward solve the U^T */ 130 idx = 0; 131 for (i=0; i<n; i++) { 132 133 v = aa + 9*diag[i]; 134 /* multiply by the inverse of the block diagonal */ 135 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 136 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 137 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 138 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 139 v += 9; 140 141 vi = aj + diag[i] + 1; 142 nz = ai[i+1] - diag[i] - 1; 143 while (nz--) { 144 oidx = 3*(*vi++); 145 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 146 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 147 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 148 v += 9; 149 } 150 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 151 idx += 3; 152 } 153 /* backward solve the L^T */ 154 for (i=n-1; i>=0; i--){ 155 v = aa + 9*diag[i] - 9; 156 vi = aj + diag[i] - 1; 157 nz = diag[i] - ai[i]; 158 idt = 3*i; 159 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 160 while (nz--) { 161 idx = 3*(*vi--); 162 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 163 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 164 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 165 v -= 9; 166 } 167 } 168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170 PetscLogFlops(2*9*(a->nz) - 3*A->n); 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 177 { 178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 179 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180 int *diag = a->diag,oidx; 181 MatScalar *aa=a->a,*v; 182 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 183 PetscScalar *x,*b; 184 185 PetscFunctionBegin; 186 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189 190 /* forward solve the U^T */ 191 idx = 0; 192 for (i=0; i<n; i++) { 193 194 v = aa + 16*diag[i]; 195 /* multiply by the inverse of the block diagonal */ 196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 197 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 198 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 199 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 200 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 201 v += 16; 202 203 vi = aj + diag[i] + 1; 204 nz = ai[i+1] - diag[i] - 1; 205 while (nz--) { 206 oidx = 4*(*vi++); 207 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 208 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 209 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 210 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 211 v += 16; 212 } 213 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 214 idx += 4; 215 } 216 /* backward solve the L^T */ 217 for (i=n-1; i>=0; i--){ 218 v = aa + 16*diag[i] - 16; 219 vi = aj + diag[i] - 1; 220 nz = diag[i] - ai[i]; 221 idt = 4*i; 222 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 223 while (nz--) { 224 idx = 4*(*vi--); 225 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 226 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 227 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 228 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 229 v -= 16; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 PetscLogFlops(2*16*(a->nz) - 4*A->n); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 244 int *diag = a->diag,oidx; 245 MatScalar *aa=a->a,*v; 246 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 247 PetscScalar *x,*b; 248 249 PetscFunctionBegin; 250 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 251 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 252 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253 254 /* forward solve the U^T */ 255 idx = 0; 256 for (i=0; i<n; i++) { 257 258 v = aa + 25*diag[i]; 259 /* multiply by the inverse of the block diagonal */ 260 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 261 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 262 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 263 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 264 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 265 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 266 v += 25; 267 268 vi = aj + diag[i] + 1; 269 nz = ai[i+1] - diag[i] - 1; 270 while (nz--) { 271 oidx = 5*(*vi++); 272 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 273 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 274 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 275 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 276 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 277 v += 25; 278 } 279 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 280 idx += 5; 281 } 282 /* backward solve the L^T */ 283 for (i=n-1; i>=0; i--){ 284 v = aa + 25*diag[i] - 25; 285 vi = aj + diag[i] - 1; 286 nz = diag[i] - ai[i]; 287 idt = 5*i; 288 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 289 while (nz--) { 290 idx = 5*(*vi--); 291 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 292 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 293 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 294 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 295 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 296 v -= 25; 297 } 298 } 299 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301 PetscLogFlops(2*25*(a->nz) - 5*A->n); 302 PetscFunctionReturn(0); 303 } 304 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 308 { 309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 310 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 311 int *diag = a->diag,oidx; 312 MatScalar *aa=a->a,*v; 313 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 314 PetscScalar *x,*b; 315 316 PetscFunctionBegin; 317 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 318 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 319 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 320 321 /* forward solve the U^T */ 322 idx = 0; 323 for (i=0; i<n; i++) { 324 325 v = aa + 36*diag[i]; 326 /* multiply by the inverse of the block diagonal */ 327 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 328 x6 = x[5+idx]; 329 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 330 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 331 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 332 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 333 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 334 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 335 v += 36; 336 337 vi = aj + diag[i] + 1; 338 nz = ai[i+1] - diag[i] - 1; 339 while (nz--) { 340 oidx = 6*(*vi++); 341 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 342 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 343 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 344 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 345 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 346 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 347 v += 36; 348 } 349 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 350 x[5+idx] = s6; 351 idx += 6; 352 } 353 /* backward solve the L^T */ 354 for (i=n-1; i>=0; i--){ 355 v = aa + 36*diag[i] - 36; 356 vi = aj + diag[i] - 1; 357 nz = diag[i] - ai[i]; 358 idt = 6*i; 359 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360 s6 = x[5+idt]; 361 while (nz--) { 362 idx = 6*(*vi--); 363 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 364 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 365 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 366 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 367 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 368 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 369 v -= 36; 370 } 371 } 372 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 PetscLogFlops(2*36*(a->nz) - 6*A->n); 375 PetscFunctionReturn(0); 376 } 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 381 { 382 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 383 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 384 int *diag = a->diag,oidx; 385 MatScalar *aa=a->a,*v; 386 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 387 PetscScalar *x,*b; 388 389 PetscFunctionBegin; 390 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 391 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393 394 /* forward solve the U^T */ 395 idx = 0; 396 for (i=0; i<n; i++) { 397 398 v = aa + 49*diag[i]; 399 /* multiply by the inverse of the block diagonal */ 400 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 401 x6 = x[5+idx]; x7 = x[6+idx]; 402 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 403 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 404 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 405 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 406 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 407 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 408 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 409 v += 49; 410 411 vi = aj + diag[i] + 1; 412 nz = ai[i+1] - diag[i] - 1; 413 while (nz--) { 414 oidx = 7*(*vi++); 415 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 416 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 417 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 418 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 419 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 420 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 421 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 422 v += 49; 423 } 424 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 425 x[5+idx] = s6;x[6+idx] = s7; 426 idx += 7; 427 } 428 /* backward solve the L^T */ 429 for (i=n-1; i>=0; i--){ 430 v = aa + 49*diag[i] - 49; 431 vi = aj + diag[i] - 1; 432 nz = diag[i] - ai[i]; 433 idt = 7*i; 434 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 435 s6 = x[5+idt];s7 = x[6+idt]; 436 while (nz--) { 437 idx = 7*(*vi--); 438 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 439 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 440 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 441 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 442 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 443 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 444 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 445 v -= 49; 446 } 447 } 448 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450 PetscLogFlops(2*49*(a->nz) - 7*A->n); 451 PetscFunctionReturn(0); 452 } 453 454 /*---------------------------------------------------------------------------------------------*/ 455 #undef __FUNCT__ 456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 458 { 459 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 460 IS iscol=a->col,isrow=a->row; 461 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 462 int *diag = a->diag; 463 MatScalar *aa=a->a,*v; 464 PetscScalar s1,*x,*b,*t; 465 466 PetscFunctionBegin; 467 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469 t = a->solve_work; 470 471 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 472 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 473 474 /* copy the b into temp work space according to permutation */ 475 for (i=0; i<n; i++) { 476 t[i] = b[c[i]]; 477 } 478 479 /* forward solve the U^T */ 480 for (i=0; i<n; i++) { 481 482 v = aa + diag[i]; 483 /* multiply by the inverse of the block diagonal */ 484 s1 = (*v++)*t[i]; 485 vi = aj + diag[i] + 1; 486 nz = ai[i+1] - diag[i] - 1; 487 while (nz--) { 488 t[*vi++] -= (*v++)*s1; 489 } 490 t[i] = s1; 491 } 492 /* backward solve the L^T */ 493 for (i=n-1; i>=0; i--){ 494 v = aa + diag[i] - 1; 495 vi = aj + diag[i] - 1; 496 nz = diag[i] - ai[i]; 497 s1 = t[i]; 498 while (nz--) { 499 t[*vi--] -= (*v--)*s1; 500 } 501 } 502 503 /* copy t into x according to permutation */ 504 for (i=0; i<n; i++) { 505 x[r[i]] = t[i]; 506 } 507 508 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 509 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 510 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 512 PetscLogFlops(2*(a->nz) - A->n); 513 PetscFunctionReturn(0); 514 } 515 516 #undef __FUNCT__ 517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 519 { 520 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 521 IS iscol=a->col,isrow=a->row; 522 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 523 int *diag = a->diag,ii,ic,ir,oidx; 524 MatScalar *aa=a->a,*v; 525 PetscScalar s1,s2,x1,x2; 526 PetscScalar *x,*b,*t; 527 528 PetscFunctionBegin; 529 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 530 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 531 t = a->solve_work; 532 533 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 534 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 535 536 /* copy the b into temp work space according to permutation */ 537 ii = 0; 538 for (i=0; i<n; i++) { 539 ic = 2*c[i]; 540 t[ii] = b[ic]; 541 t[ii+1] = b[ic+1]; 542 ii += 2; 543 } 544 545 /* forward solve the U^T */ 546 idx = 0; 547 for (i=0; i<n; i++) { 548 549 v = aa + 4*diag[i]; 550 /* multiply by the inverse of the block diagonal */ 551 x1 = t[idx]; x2 = t[1+idx]; 552 s1 = v[0]*x1 + v[1]*x2; 553 s2 = v[2]*x1 + v[3]*x2; 554 v += 4; 555 556 vi = aj + diag[i] + 1; 557 nz = ai[i+1] - diag[i] - 1; 558 while (nz--) { 559 oidx = 2*(*vi++); 560 t[oidx] -= v[0]*s1 + v[1]*s2; 561 t[oidx+1] -= v[2]*s1 + v[3]*s2; 562 v += 4; 563 } 564 t[idx] = s1;t[1+idx] = s2; 565 idx += 2; 566 } 567 /* backward solve the L^T */ 568 for (i=n-1; i>=0; i--){ 569 v = aa + 4*diag[i] - 4; 570 vi = aj + diag[i] - 1; 571 nz = diag[i] - ai[i]; 572 idt = 2*i; 573 s1 = t[idt]; s2 = t[1+idt]; 574 while (nz--) { 575 idx = 2*(*vi--); 576 t[idx] -= v[0]*s1 + v[1]*s2; 577 t[idx+1] -= v[2]*s1 + v[3]*s2; 578 v -= 4; 579 } 580 } 581 582 /* copy t into x according to permutation */ 583 ii = 0; 584 for (i=0; i<n; i++) { 585 ir = 2*r[i]; 586 x[ir] = t[ii]; 587 x[ir+1] = t[ii+1]; 588 ii += 2; 589 } 590 591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 595 PetscLogFlops(2*4*(a->nz) - 2*A->n); 596 PetscFunctionReturn(0); 597 } 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 602 { 603 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 604 IS iscol=a->col,isrow=a->row; 605 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 606 int *diag = a->diag,ii,ic,ir,oidx; 607 MatScalar *aa=a->a,*v; 608 PetscScalar s1,s2,s3,x1,x2,x3; 609 PetscScalar *x,*b,*t; 610 611 PetscFunctionBegin; 612 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614 t = a->solve_work; 615 616 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 617 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 618 619 /* copy the b into temp work space according to permutation */ 620 ii = 0; 621 for (i=0; i<n; i++) { 622 ic = 3*c[i]; 623 t[ii] = b[ic]; 624 t[ii+1] = b[ic+1]; 625 t[ii+2] = b[ic+2]; 626 ii += 3; 627 } 628 629 /* forward solve the U^T */ 630 idx = 0; 631 for (i=0; i<n; i++) { 632 633 v = aa + 9*diag[i]; 634 /* multiply by the inverse of the block diagonal */ 635 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 636 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 637 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 638 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 639 v += 9; 640 641 vi = aj + diag[i] + 1; 642 nz = ai[i+1] - diag[i] - 1; 643 while (nz--) { 644 oidx = 3*(*vi++); 645 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 646 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 647 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 648 v += 9; 649 } 650 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 651 idx += 3; 652 } 653 /* backward solve the L^T */ 654 for (i=n-1; i>=0; i--){ 655 v = aa + 9*diag[i] - 9; 656 vi = aj + diag[i] - 1; 657 nz = diag[i] - ai[i]; 658 idt = 3*i; 659 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 660 while (nz--) { 661 idx = 3*(*vi--); 662 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 663 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 664 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 665 v -= 9; 666 } 667 } 668 669 /* copy t into x according to permutation */ 670 ii = 0; 671 for (i=0; i<n; i++) { 672 ir = 3*r[i]; 673 x[ir] = t[ii]; 674 x[ir+1] = t[ii+1]; 675 x[ir+2] = t[ii+2]; 676 ii += 3; 677 } 678 679 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 680 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 681 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 PetscLogFlops(2*9*(a->nz) - 3*A->n); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 690 { 691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 692 IS iscol=a->col,isrow=a->row; 693 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 694 int *diag = a->diag,ii,ic,ir,oidx; 695 MatScalar *aa=a->a,*v; 696 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 697 PetscScalar *x,*b,*t; 698 699 PetscFunctionBegin; 700 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 t = a->solve_work; 703 704 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 705 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 706 707 /* copy the b into temp work space according to permutation */ 708 ii = 0; 709 for (i=0; i<n; i++) { 710 ic = 4*c[i]; 711 t[ii] = b[ic]; 712 t[ii+1] = b[ic+1]; 713 t[ii+2] = b[ic+2]; 714 t[ii+3] = b[ic+3]; 715 ii += 4; 716 } 717 718 /* forward solve the U^T */ 719 idx = 0; 720 for (i=0; i<n; i++) { 721 722 v = aa + 16*diag[i]; 723 /* multiply by the inverse of the block diagonal */ 724 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 725 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 726 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 727 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 728 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 729 v += 16; 730 731 vi = aj + diag[i] + 1; 732 nz = ai[i+1] - diag[i] - 1; 733 while (nz--) { 734 oidx = 4*(*vi++); 735 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 736 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 737 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 738 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 739 v += 16; 740 } 741 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 742 idx += 4; 743 } 744 /* backward solve the L^T */ 745 for (i=n-1; i>=0; i--){ 746 v = aa + 16*diag[i] - 16; 747 vi = aj + diag[i] - 1; 748 nz = diag[i] - ai[i]; 749 idt = 4*i; 750 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 751 while (nz--) { 752 idx = 4*(*vi--); 753 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 754 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 755 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 756 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 757 v -= 16; 758 } 759 } 760 761 /* copy t into x according to permutation */ 762 ii = 0; 763 for (i=0; i<n; i++) { 764 ir = 4*r[i]; 765 x[ir] = t[ii]; 766 x[ir+1] = t[ii+1]; 767 x[ir+2] = t[ii+2]; 768 x[ir+3] = t[ii+3]; 769 ii += 4; 770 } 771 772 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 773 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 774 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776 PetscLogFlops(2*16*(a->nz) - 4*A->n); 777 PetscFunctionReturn(0); 778 } 779 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 783 { 784 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 785 IS iscol=a->col,isrow=a->row; 786 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 787 int *diag = a->diag,ii,ic,ir,oidx; 788 MatScalar *aa=a->a,*v; 789 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 790 PetscScalar *x,*b,*t; 791 792 PetscFunctionBegin; 793 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 t = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 799 800 /* copy the b into temp work space according to permutation */ 801 ii = 0; 802 for (i=0; i<n; i++) { 803 ic = 5*c[i]; 804 t[ii] = b[ic]; 805 t[ii+1] = b[ic+1]; 806 t[ii+2] = b[ic+2]; 807 t[ii+3] = b[ic+3]; 808 t[ii+4] = b[ic+4]; 809 ii += 5; 810 } 811 812 /* forward solve the U^T */ 813 idx = 0; 814 for (i=0; i<n; i++) { 815 816 v = aa + 25*diag[i]; 817 /* multiply by the inverse of the block diagonal */ 818 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 819 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 820 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 821 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 822 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 823 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 824 v += 25; 825 826 vi = aj + diag[i] + 1; 827 nz = ai[i+1] - diag[i] - 1; 828 while (nz--) { 829 oidx = 5*(*vi++); 830 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 831 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 832 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 833 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 834 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 835 v += 25; 836 } 837 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 838 idx += 5; 839 } 840 /* backward solve the L^T */ 841 for (i=n-1; i>=0; i--){ 842 v = aa + 25*diag[i] - 25; 843 vi = aj + diag[i] - 1; 844 nz = diag[i] - ai[i]; 845 idt = 5*i; 846 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 847 while (nz--) { 848 idx = 5*(*vi--); 849 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v -= 25; 855 } 856 } 857 858 /* copy t into x according to permutation */ 859 ii = 0; 860 for (i=0; i<n; i++) { 861 ir = 5*r[i]; 862 x[ir] = t[ii]; 863 x[ir+1] = t[ii+1]; 864 x[ir+2] = t[ii+2]; 865 x[ir+3] = t[ii+3]; 866 x[ir+4] = t[ii+4]; 867 ii += 5; 868 } 869 870 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 871 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 872 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 873 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 874 PetscLogFlops(2*25*(a->nz) - 5*A->n); 875 PetscFunctionReturn(0); 876 } 877 878 #undef __FUNCT__ 879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 881 { 882 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 883 IS iscol=a->col,isrow=a->row; 884 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 885 int *diag = a->diag,ii,ic,ir,oidx; 886 MatScalar *aa=a->a,*v; 887 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 888 PetscScalar *x,*b,*t; 889 890 PetscFunctionBegin; 891 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 893 t = a->solve_work; 894 895 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 896 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 897 898 /* copy the b into temp work space according to permutation */ 899 ii = 0; 900 for (i=0; i<n; i++) { 901 ic = 6*c[i]; 902 t[ii] = b[ic]; 903 t[ii+1] = b[ic+1]; 904 t[ii+2] = b[ic+2]; 905 t[ii+3] = b[ic+3]; 906 t[ii+4] = b[ic+4]; 907 t[ii+5] = b[ic+5]; 908 ii += 6; 909 } 910 911 /* forward solve the U^T */ 912 idx = 0; 913 for (i=0; i<n; i++) { 914 915 v = aa + 36*diag[i]; 916 /* multiply by the inverse of the block diagonal */ 917 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 918 x6 = t[5+idx]; 919 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 920 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 921 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 922 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 923 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 924 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 925 v += 36; 926 927 vi = aj + diag[i] + 1; 928 nz = ai[i+1] - diag[i] - 1; 929 while (nz--) { 930 oidx = 6*(*vi++); 931 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 932 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 933 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 934 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 935 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 936 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 937 v += 36; 938 } 939 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 940 t[5+idx] = s6; 941 idx += 6; 942 } 943 /* backward solve the L^T */ 944 for (i=n-1; i>=0; i--){ 945 v = aa + 36*diag[i] - 36; 946 vi = aj + diag[i] - 1; 947 nz = diag[i] - ai[i]; 948 idt = 6*i; 949 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 950 s6 = t[5+idt]; 951 while (nz--) { 952 idx = 6*(*vi--); 953 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 954 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 955 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 956 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 957 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 958 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 959 v -= 36; 960 } 961 } 962 963 /* copy t into x according to permutation */ 964 ii = 0; 965 for (i=0; i<n; i++) { 966 ir = 6*r[i]; 967 x[ir] = t[ii]; 968 x[ir+1] = t[ii+1]; 969 x[ir+2] = t[ii+2]; 970 x[ir+3] = t[ii+3]; 971 x[ir+4] = t[ii+4]; 972 x[ir+5] = t[ii+5]; 973 ii += 6; 974 } 975 976 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 977 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 978 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 979 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 980 PetscLogFlops(2*36*(a->nz) - 6*A->n); 981 PetscFunctionReturn(0); 982 } 983 984 #undef __FUNCT__ 985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 987 { 988 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 989 IS iscol=a->col,isrow=a->row; 990 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 991 int *diag = a->diag,ii,ic,ir,oidx; 992 MatScalar *aa=a->a,*v; 993 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 994 PetscScalar *x,*b,*t; 995 996 PetscFunctionBegin; 997 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 999 t = a->solve_work; 1000 1001 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1002 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1003 1004 /* copy the b into temp work space according to permutation */ 1005 ii = 0; 1006 for (i=0; i<n; i++) { 1007 ic = 7*c[i]; 1008 t[ii] = b[ic]; 1009 t[ii+1] = b[ic+1]; 1010 t[ii+2] = b[ic+2]; 1011 t[ii+3] = b[ic+3]; 1012 t[ii+4] = b[ic+4]; 1013 t[ii+5] = b[ic+5]; 1014 t[ii+6] = b[ic+6]; 1015 ii += 7; 1016 } 1017 1018 /* forward solve the U^T */ 1019 idx = 0; 1020 for (i=0; i<n; i++) { 1021 1022 v = aa + 49*diag[i]; 1023 /* multiply by the inverse of the block diagonal */ 1024 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1025 x6 = t[5+idx]; x7 = t[6+idx]; 1026 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1027 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1028 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1029 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1030 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1031 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1032 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1033 v += 49; 1034 1035 vi = aj + diag[i] + 1; 1036 nz = ai[i+1] - diag[i] - 1; 1037 while (nz--) { 1038 oidx = 7*(*vi++); 1039 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1040 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1041 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1042 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1043 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1044 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1045 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1046 v += 49; 1047 } 1048 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1049 t[5+idx] = s6;t[6+idx] = s7; 1050 idx += 7; 1051 } 1052 /* backward solve the L^T */ 1053 for (i=n-1; i>=0; i--){ 1054 v = aa + 49*diag[i] - 49; 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - ai[i]; 1057 idt = 7*i; 1058 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1059 s6 = t[5+idt];s7 = t[6+idt]; 1060 while (nz--) { 1061 idx = 7*(*vi--); 1062 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v -= 49; 1070 } 1071 } 1072 1073 /* copy t into x according to permutation */ 1074 ii = 0; 1075 for (i=0; i<n; i++) { 1076 ir = 7*r[i]; 1077 x[ir] = t[ii]; 1078 x[ir+1] = t[ii+1]; 1079 x[ir+2] = t[ii+2]; 1080 x[ir+3] = t[ii+3]; 1081 x[ir+4] = t[ii+4]; 1082 x[ir+5] = t[ii+5]; 1083 x[ir+6] = t[ii+6]; 1084 ii += 7; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /* ----------------------------------------------------------- */ 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1099 { 1100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101 IS iscol=a->col,isrow=a->row; 1102 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1103 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1104 MatScalar *aa=a->a,*v; 1105 PetscScalar *x,*b,*s,*t,*ls; 1106 1107 PetscFunctionBegin; 1108 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1109 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1110 t = a->solve_work; 1111 1112 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1113 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1114 1115 /* forward solve the lower triangular */ 1116 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1117 for (i=1; i<n; i++) { 1118 v = aa + bs2*ai[i]; 1119 vi = aj + ai[i]; 1120 nz = a->diag[i] - ai[i]; 1121 s = t + bs*i; 1122 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1123 while (nz--) { 1124 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1125 v += bs2; 1126 } 1127 } 1128 /* backward solve the upper triangular */ 1129 ls = a->solve_work + A->n; 1130 for (i=n-1; i>=0; i--){ 1131 v = aa + bs2*(a->diag[i] + 1); 1132 vi = aj + a->diag[i] + 1; 1133 nz = ai[i+1] - a->diag[i] - 1; 1134 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1135 while (nz--) { 1136 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1137 v += bs2; 1138 } 1139 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1140 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1141 } 1142 1143 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1144 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1147 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1154 { 1155 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1156 IS iscol=a->col,isrow=a->row; 1157 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1158 int *diag = a->diag; 1159 MatScalar *aa=a->a,*v; 1160 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1161 PetscScalar *x,*b,*t; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 t = a->solve_work; 1167 1168 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1169 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1170 1171 /* forward solve the lower triangular */ 1172 idx = 7*(*r++); 1173 t[0] = b[idx]; t[1] = b[1+idx]; 1174 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1175 t[5] = b[5+idx]; t[6] = b[6+idx]; 1176 1177 for (i=1; i<n; i++) { 1178 v = aa + 49*ai[i]; 1179 vi = aj + ai[i]; 1180 nz = diag[i] - ai[i]; 1181 idx = 7*(*r++); 1182 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1183 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1184 while (nz--) { 1185 idx = 7*(*vi++); 1186 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1187 x4 = t[3+idx];x5 = t[4+idx]; 1188 x6 = t[5+idx];x7 = t[6+idx]; 1189 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1193 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1196 v += 49; 1197 } 1198 idx = 7*i; 1199 t[idx] = s1;t[1+idx] = s2; 1200 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1201 t[5+idx] = s6;t[6+idx] = s7; 1202 } 1203 /* backward solve the upper triangular */ 1204 for (i=n-1; i>=0; i--){ 1205 v = aa + 49*diag[i] + 49; 1206 vi = aj + diag[i] + 1; 1207 nz = ai[i+1] - diag[i] - 1; 1208 idt = 7*i; 1209 s1 = t[idt]; s2 = t[1+idt]; 1210 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1211 s6 = t[5+idt];s7 = t[6+idt]; 1212 while (nz--) { 1213 idx = 7*(*vi++); 1214 x1 = t[idx]; x2 = t[1+idx]; 1215 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1216 x6 = t[5+idx]; x7 = t[6+idx]; 1217 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1218 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1219 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1220 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1221 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1222 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1223 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1224 v += 49; 1225 } 1226 idc = 7*(*c--); 1227 v = aa + 49*diag[i]; 1228 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1229 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1230 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1231 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1232 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1233 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1234 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1235 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1236 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1237 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1238 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1239 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1240 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1241 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1242 } 1243 1244 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1245 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 #undef __FUNCT__ 1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1255 { 1256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1257 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1258 int ierr,*diag = a->diag,jdx; 1259 MatScalar *aa=a->a,*v; 1260 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1261 1262 PetscFunctionBegin; 1263 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1265 /* forward solve the lower triangular */ 1266 idx = 0; 1267 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1268 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1269 x[6] = b[6+idx]; 1270 for (i=1; i<n; i++) { 1271 v = aa + 49*ai[i]; 1272 vi = aj + ai[i]; 1273 nz = diag[i] - ai[i]; 1274 idx = 7*i; 1275 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1276 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1277 s7 = b[6+idx]; 1278 while (nz--) { 1279 jdx = 7*(*vi++); 1280 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1281 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1282 x7 = x[6+jdx]; 1283 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1284 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1285 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1286 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1287 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1288 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1289 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1290 v += 49; 1291 } 1292 x[idx] = s1; 1293 x[1+idx] = s2; 1294 x[2+idx] = s3; 1295 x[3+idx] = s4; 1296 x[4+idx] = s5; 1297 x[5+idx] = s6; 1298 x[6+idx] = s7; 1299 } 1300 /* backward solve the upper triangular */ 1301 for (i=n-1; i>=0; i--){ 1302 v = aa + 49*diag[i] + 49; 1303 vi = aj + diag[i] + 1; 1304 nz = ai[i+1] - diag[i] - 1; 1305 idt = 7*i; 1306 s1 = x[idt]; s2 = x[1+idt]; 1307 s3 = x[2+idt]; s4 = x[3+idt]; 1308 s5 = x[4+idt]; s6 = x[5+idt]; 1309 s7 = x[6+idt]; 1310 while (nz--) { 1311 idx = 7*(*vi++); 1312 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1313 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1314 x7 = x[6+idx]; 1315 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1316 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1317 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1318 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1319 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1320 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1321 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1322 v += 49; 1323 } 1324 v = aa + 49*diag[i]; 1325 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1326 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1327 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1328 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1329 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1330 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1331 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1332 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1333 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1334 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1335 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1336 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1337 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1338 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1339 } 1340 1341 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1343 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1350 { 1351 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1352 IS iscol=a->col,isrow=a->row; 1353 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1354 int *diag = a->diag; 1355 MatScalar *aa=a->a,*v; 1356 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 t = a->solve_work; 1362 1363 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1364 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1365 1366 /* forward solve the lower triangular */ 1367 idx = 6*(*r++); 1368 t[0] = b[idx]; t[1] = b[1+idx]; 1369 t[2] = b[2+idx]; t[3] = b[3+idx]; 1370 t[4] = b[4+idx]; t[5] = b[5+idx]; 1371 for (i=1; i<n; i++) { 1372 v = aa + 36*ai[i]; 1373 vi = aj + ai[i]; 1374 nz = diag[i] - ai[i]; 1375 idx = 6*(*r++); 1376 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1377 s5 = b[4+idx]; s6 = b[5+idx]; 1378 while (nz--) { 1379 idx = 6*(*vi++); 1380 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1381 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1382 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1383 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1384 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1385 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1386 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1387 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1388 v += 36; 1389 } 1390 idx = 6*i; 1391 t[idx] = s1;t[1+idx] = s2; 1392 t[2+idx] = s3;t[3+idx] = s4; 1393 t[4+idx] = s5;t[5+idx] = s6; 1394 } 1395 /* backward solve the upper triangular */ 1396 for (i=n-1; i>=0; i--){ 1397 v = aa + 36*diag[i] + 36; 1398 vi = aj + diag[i] + 1; 1399 nz = ai[i+1] - diag[i] - 1; 1400 idt = 6*i; 1401 s1 = t[idt]; s2 = t[1+idt]; 1402 s3 = t[2+idt];s4 = t[3+idt]; 1403 s5 = t[4+idt];s6 = t[5+idt]; 1404 while (nz--) { 1405 idx = 6*(*vi++); 1406 x1 = t[idx]; x2 = t[1+idx]; 1407 x3 = t[2+idx]; x4 = t[3+idx]; 1408 x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idc = 6*(*c--); 1418 v = aa + 36*diag[i]; 1419 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1420 v[18]*s4+v[24]*s5+v[30]*s6; 1421 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1422 v[19]*s4+v[25]*s5+v[31]*s6; 1423 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1424 v[20]*s4+v[26]*s5+v[32]*s6; 1425 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1426 v[21]*s4+v[27]*s5+v[33]*s6; 1427 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1428 v[22]*s4+v[28]*s5+v[34]*s6; 1429 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1430 v[23]*s4+v[29]*s5+v[35]*s6; 1431 } 1432 1433 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1434 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1435 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1436 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1437 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 #undef __FUNCT__ 1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1446 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1447 int ierr,*diag = a->diag,jdx; 1448 MatScalar *aa=a->a,*v; 1449 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1450 1451 PetscFunctionBegin; 1452 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1454 /* forward solve the lower triangular */ 1455 idx = 0; 1456 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1457 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1458 for (i=1; i<n; i++) { 1459 v = aa + 36*ai[i]; 1460 vi = aj + ai[i]; 1461 nz = diag[i] - ai[i]; 1462 idx = 6*i; 1463 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1464 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1465 while (nz--) { 1466 jdx = 6*(*vi++); 1467 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1468 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1469 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1470 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1471 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1472 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1473 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1474 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1475 v += 36; 1476 } 1477 x[idx] = s1; 1478 x[1+idx] = s2; 1479 x[2+idx] = s3; 1480 x[3+idx] = s4; 1481 x[4+idx] = s5; 1482 x[5+idx] = s6; 1483 } 1484 /* backward solve the upper triangular */ 1485 for (i=n-1; i>=0; i--){ 1486 v = aa + 36*diag[i] + 36; 1487 vi = aj + diag[i] + 1; 1488 nz = ai[i+1] - diag[i] - 1; 1489 idt = 6*i; 1490 s1 = x[idt]; s2 = x[1+idt]; 1491 s3 = x[2+idt]; s4 = x[3+idt]; 1492 s5 = x[4+idt]; s6 = x[5+idt]; 1493 while (nz--) { 1494 idx = 6*(*vi++); 1495 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1496 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 v = aa + 36*diag[i]; 1506 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1507 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1508 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1509 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1510 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1511 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1512 } 1513 1514 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1516 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 #undef __FUNCT__ 1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1523 { 1524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1525 IS iscol=a->col,isrow=a->row; 1526 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1527 int *diag = a->diag; 1528 MatScalar *aa=a->a,*v; 1529 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534 t = a->solve_work; 1535 1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1538 1539 /* forward solve the lower triangular */ 1540 idx = 5*(*r++); 1541 t[0] = b[idx]; t[1] = b[1+idx]; 1542 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1543 for (i=1; i<n; i++) { 1544 v = aa + 25*ai[i]; 1545 vi = aj + ai[i]; 1546 nz = diag[i] - ai[i]; 1547 idx = 5*(*r++); 1548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1549 s5 = b[4+idx]; 1550 while (nz--) { 1551 idx = 5*(*vi++); 1552 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1553 x4 = t[3+idx];x5 = t[4+idx]; 1554 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1555 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1556 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1557 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1558 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1559 v += 25; 1560 } 1561 idx = 5*i; 1562 t[idx] = s1;t[1+idx] = s2; 1563 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1564 } 1565 /* backward solve the upper triangular */ 1566 for (i=n-1; i>=0; i--){ 1567 v = aa + 25*diag[i] + 25; 1568 vi = aj + diag[i] + 1; 1569 nz = ai[i+1] - diag[i] - 1; 1570 idt = 5*i; 1571 s1 = t[idt]; s2 = t[1+idt]; 1572 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx]; 1576 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idc = 5*(*c--); 1585 v = aa + 25*diag[i]; 1586 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1587 v[15]*s4+v[20]*s5; 1588 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1589 v[16]*s4+v[21]*s5; 1590 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1591 v[17]*s4+v[22]*s5; 1592 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1593 v[18]*s4+v[23]*s5; 1594 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1595 v[19]*s4+v[24]*s5; 1596 } 1597 1598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1602 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 #undef __FUNCT__ 1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1609 { 1610 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1611 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1612 int ierr,*diag = a->diag,jdx; 1613 MatScalar *aa=a->a,*v; 1614 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1615 1616 PetscFunctionBegin; 1617 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1619 /* forward solve the lower triangular */ 1620 idx = 0; 1621 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1622 for (i=1; i<n; i++) { 1623 v = aa + 25*ai[i]; 1624 vi = aj + ai[i]; 1625 nz = diag[i] - ai[i]; 1626 idx = 5*i; 1627 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1628 while (nz--) { 1629 jdx = 5*(*vi++); 1630 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1631 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1632 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1633 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1634 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1635 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1636 v += 25; 1637 } 1638 x[idx] = s1; 1639 x[1+idx] = s2; 1640 x[2+idx] = s3; 1641 x[3+idx] = s4; 1642 x[4+idx] = s5; 1643 } 1644 /* backward solve the upper triangular */ 1645 for (i=n-1; i>=0; i--){ 1646 v = aa + 25*diag[i] + 25; 1647 vi = aj + diag[i] + 1; 1648 nz = ai[i+1] - diag[i] - 1; 1649 idt = 5*i; 1650 s1 = x[idt]; s2 = x[1+idt]; 1651 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1652 while (nz--) { 1653 idx = 5*(*vi++); 1654 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1655 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1656 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1657 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1658 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1659 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1660 v += 25; 1661 } 1662 v = aa + 25*diag[i]; 1663 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1664 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1665 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1666 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1667 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1668 } 1669 1670 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1671 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1672 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1673 PetscFunctionReturn(0); 1674 } 1675 1676 #undef __FUNCT__ 1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1679 { 1680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1681 IS iscol=a->col,isrow=a->row; 1682 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1683 int *diag = a->diag; 1684 MatScalar *aa=a->a,*v; 1685 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1686 1687 PetscFunctionBegin; 1688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1690 t = a->solve_work; 1691 1692 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1693 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1694 1695 /* forward solve the lower triangular */ 1696 idx = 4*(*r++); 1697 t[0] = b[idx]; t[1] = b[1+idx]; 1698 t[2] = b[2+idx]; t[3] = b[3+idx]; 1699 for (i=1; i<n; i++) { 1700 v = aa + 16*ai[i]; 1701 vi = aj + ai[i]; 1702 nz = diag[i] - ai[i]; 1703 idx = 4*(*r++); 1704 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1705 while (nz--) { 1706 idx = 4*(*vi++); 1707 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1708 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1709 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1710 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1711 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1712 v += 16; 1713 } 1714 idx = 4*i; 1715 t[idx] = s1;t[1+idx] = s2; 1716 t[2+idx] = s3;t[3+idx] = s4; 1717 } 1718 /* backward solve the upper triangular */ 1719 for (i=n-1; i>=0; i--){ 1720 v = aa + 16*diag[i] + 16; 1721 vi = aj + diag[i] + 1; 1722 nz = ai[i+1] - diag[i] - 1; 1723 idt = 4*i; 1724 s1 = t[idt]; s2 = t[1+idt]; 1725 s3 = t[2+idt];s4 = t[3+idt]; 1726 while (nz--) { 1727 idx = 4*(*vi++); 1728 x1 = t[idx]; x2 = t[1+idx]; 1729 x3 = t[2+idx]; x4 = t[3+idx]; 1730 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1731 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1732 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1733 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1734 v += 16; 1735 } 1736 idc = 4*(*c--); 1737 v = aa + 16*diag[i]; 1738 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1739 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1740 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1741 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1742 } 1743 1744 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1745 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1746 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1747 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1748 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1749 PetscFunctionReturn(0); 1750 } 1751 1752 #undef __FUNCT__ 1753 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1754 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1755 { 1756 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1757 IS iscol=a->col,isrow=a->row; 1758 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1759 int *diag = a->diag; 1760 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1761 PetscScalar *x,*b; 1762 1763 PetscFunctionBegin; 1764 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1765 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1766 t = (MatScalar *)a->solve_work; 1767 1768 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1769 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1770 1771 /* forward solve the lower triangular */ 1772 idx = 4*(*r++); 1773 t[0] = (MatScalar)b[idx]; 1774 t[1] = (MatScalar)b[1+idx]; 1775 t[2] = (MatScalar)b[2+idx]; 1776 t[3] = (MatScalar)b[3+idx]; 1777 for (i=1; i<n; i++) { 1778 v = aa + 16*ai[i]; 1779 vi = aj + ai[i]; 1780 nz = diag[i] - ai[i]; 1781 idx = 4*(*r++); 1782 s1 = (MatScalar)b[idx]; 1783 s2 = (MatScalar)b[1+idx]; 1784 s3 = (MatScalar)b[2+idx]; 1785 s4 = (MatScalar)b[3+idx]; 1786 while (nz--) { 1787 idx = 4*(*vi++); 1788 x1 = t[idx]; 1789 x2 = t[1+idx]; 1790 x3 = t[2+idx]; 1791 x4 = t[3+idx]; 1792 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1793 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1794 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1795 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1796 v += 16; 1797 } 1798 idx = 4*i; 1799 t[idx] = s1; 1800 t[1+idx] = s2; 1801 t[2+idx] = s3; 1802 t[3+idx] = s4; 1803 } 1804 /* backward solve the upper triangular */ 1805 for (i=n-1; i>=0; i--){ 1806 v = aa + 16*diag[i] + 16; 1807 vi = aj + diag[i] + 1; 1808 nz = ai[i+1] - diag[i] - 1; 1809 idt = 4*i; 1810 s1 = t[idt]; 1811 s2 = t[1+idt]; 1812 s3 = t[2+idt]; 1813 s4 = t[3+idt]; 1814 while (nz--) { 1815 idx = 4*(*vi++); 1816 x1 = t[idx]; 1817 x2 = t[1+idx]; 1818 x3 = t[2+idx]; 1819 x4 = t[3+idx]; 1820 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1821 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1822 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1823 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1824 v += 16; 1825 } 1826 idc = 4*(*c--); 1827 v = aa + 16*diag[i]; 1828 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1829 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1830 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1831 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1832 x[idc] = (PetscScalar)t[idt]; 1833 x[1+idc] = (PetscScalar)t[1+idt]; 1834 x[2+idc] = (PetscScalar)t[2+idt]; 1835 x[3+idc] = (PetscScalar)t[3+idt]; 1836 } 1837 1838 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1839 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1840 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1841 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1842 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1843 PetscFunctionReturn(0); 1844 } 1845 1846 #if defined (PETSC_HAVE_SSE) 1847 1848 #include PETSC_HAVE_SSE 1849 1850 #undef __FUNCT__ 1851 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1852 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1853 { 1854 /* 1855 Note: This code uses demotion of double 1856 to float when performing the mixed-mode computation. 1857 This may not be numerically reasonable for all applications. 1858 */ 1859 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1860 IS iscol=a->col,isrow=a->row; 1861 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1862 int *diag = a->diag,ai16; 1863 MatScalar *aa=a->a,*v; 1864 PetscScalar *x,*b,*t; 1865 1866 /* Make space in temp stack for 16 Byte Aligned arrays */ 1867 float ssealignedspace[11],*tmps,*tmpx; 1868 unsigned long offset; 1869 1870 PetscFunctionBegin; 1871 SSE_SCOPE_BEGIN; 1872 1873 offset = (unsigned long)ssealignedspace % 16; 1874 if (offset) offset = (16 - offset)/4; 1875 tmps = &ssealignedspace[offset]; 1876 tmpx = &ssealignedspace[offset+4]; 1877 PREFETCH_NTA(aa+16*ai[1]); 1878 1879 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1880 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1881 t = a->solve_work; 1882 1883 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1884 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1885 1886 /* forward solve the lower triangular */ 1887 idx = 4*(*r++); 1888 t[0] = b[idx]; t[1] = b[1+idx]; 1889 t[2] = b[2+idx]; t[3] = b[3+idx]; 1890 v = aa + 16*ai[1]; 1891 1892 for (i=1; i<n;) { 1893 PREFETCH_NTA(&v[8]); 1894 vi = aj + ai[i]; 1895 nz = diag[i] - ai[i]; 1896 idx = 4*(*r++); 1897 1898 /* Demote sum from double to float */ 1899 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1900 LOAD_PS(tmps,XMM7); 1901 1902 while (nz--) { 1903 PREFETCH_NTA(&v[16]); 1904 idx = 4*(*vi++); 1905 1906 /* Demote solution (so far) from double to float */ 1907 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1908 1909 /* 4x4 Matrix-Vector product with negative accumulation: */ 1910 SSE_INLINE_BEGIN_2(tmpx,v) 1911 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1912 1913 /* First Column */ 1914 SSE_COPY_PS(XMM0,XMM6) 1915 SSE_SHUFFLE(XMM0,XMM0,0x00) 1916 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1917 SSE_SUB_PS(XMM7,XMM0) 1918 1919 /* Second Column */ 1920 SSE_COPY_PS(XMM1,XMM6) 1921 SSE_SHUFFLE(XMM1,XMM1,0x55) 1922 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1923 SSE_SUB_PS(XMM7,XMM1) 1924 1925 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1926 1927 /* Third Column */ 1928 SSE_COPY_PS(XMM2,XMM6) 1929 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1930 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1931 SSE_SUB_PS(XMM7,XMM2) 1932 1933 /* Fourth Column */ 1934 SSE_COPY_PS(XMM3,XMM6) 1935 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1936 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1937 SSE_SUB_PS(XMM7,XMM3) 1938 SSE_INLINE_END_2 1939 1940 v += 16; 1941 } 1942 idx = 4*i; 1943 v = aa + 16*ai[++i]; 1944 PREFETCH_NTA(v); 1945 STORE_PS(tmps,XMM7); 1946 1947 /* Promote result from float to double */ 1948 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1949 } 1950 /* backward solve the upper triangular */ 1951 idt = 4*(n-1); 1952 ai16 = 16*diag[n-1]; 1953 v = aa + ai16 + 16; 1954 for (i=n-1; i>=0;){ 1955 PREFETCH_NTA(&v[8]); 1956 vi = aj + diag[i] + 1; 1957 nz = ai[i+1] - diag[i] - 1; 1958 1959 /* Demote accumulator from double to float */ 1960 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1961 LOAD_PS(tmps,XMM7); 1962 1963 while (nz--) { 1964 PREFETCH_NTA(&v[16]); 1965 idx = 4*(*vi++); 1966 1967 /* Demote solution (so far) from double to float */ 1968 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1969 1970 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1971 SSE_INLINE_BEGIN_2(tmpx,v) 1972 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1973 1974 /* First Column */ 1975 SSE_COPY_PS(XMM0,XMM6) 1976 SSE_SHUFFLE(XMM0,XMM0,0x00) 1977 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1978 SSE_SUB_PS(XMM7,XMM0) 1979 1980 /* Second Column */ 1981 SSE_COPY_PS(XMM1,XMM6) 1982 SSE_SHUFFLE(XMM1,XMM1,0x55) 1983 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1984 SSE_SUB_PS(XMM7,XMM1) 1985 1986 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1987 1988 /* Third Column */ 1989 SSE_COPY_PS(XMM2,XMM6) 1990 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1991 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1992 SSE_SUB_PS(XMM7,XMM2) 1993 1994 /* Fourth Column */ 1995 SSE_COPY_PS(XMM3,XMM6) 1996 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1997 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1998 SSE_SUB_PS(XMM7,XMM3) 1999 SSE_INLINE_END_2 2000 v += 16; 2001 } 2002 v = aa + ai16; 2003 ai16 = 16*diag[--i]; 2004 PREFETCH_NTA(aa+ai16+16); 2005 /* 2006 Scale the result by the diagonal 4x4 block, 2007 which was inverted as part of the factorization 2008 */ 2009 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2010 /* First Column */ 2011 SSE_COPY_PS(XMM0,XMM7) 2012 SSE_SHUFFLE(XMM0,XMM0,0x00) 2013 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2014 2015 /* Second Column */ 2016 SSE_COPY_PS(XMM1,XMM7) 2017 SSE_SHUFFLE(XMM1,XMM1,0x55) 2018 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2019 SSE_ADD_PS(XMM0,XMM1) 2020 2021 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2022 2023 /* Third Column */ 2024 SSE_COPY_PS(XMM2,XMM7) 2025 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2026 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2027 SSE_ADD_PS(XMM0,XMM2) 2028 2029 /* Fourth Column */ 2030 SSE_COPY_PS(XMM3,XMM7) 2031 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2032 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2033 SSE_ADD_PS(XMM0,XMM3) 2034 2035 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2036 SSE_INLINE_END_3 2037 2038 /* Promote solution from float to double */ 2039 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2040 2041 /* Apply reordering to t and stream into x. */ 2042 /* This way, x doesn't pollute the cache. */ 2043 /* Be careful with size: 2 doubles = 4 floats! */ 2044 idc = 4*(*c--); 2045 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2046 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2047 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2048 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2049 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2050 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2051 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2052 SSE_INLINE_END_2 2053 v = aa + ai16 + 16; 2054 idt -= 4; 2055 } 2056 2057 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2058 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2061 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2062 SSE_SCOPE_END; 2063 PetscFunctionReturn(0); 2064 } 2065 2066 #endif 2067 2068 2069 /* 2070 Special case where the matrix was ILU(0) factored in the natural 2071 ordering. This eliminates the need for the column and row permutation. 2072 */ 2073 #undef __FUNCT__ 2074 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2075 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2076 { 2077 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2078 int n=a->mbs,*ai=a->i,*aj=a->j; 2079 int ierr,*diag = a->diag; 2080 MatScalar *aa=a->a; 2081 PetscScalar *x,*b; 2082 2083 PetscFunctionBegin; 2084 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2085 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2086 2087 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2088 { 2089 static PetscScalar w[2000]; /* very BAD need to fix */ 2090 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2091 } 2092 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2093 { 2094 static PetscScalar w[2000]; /* very BAD need to fix */ 2095 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2096 } 2097 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2098 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2099 #else 2100 { 2101 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2102 MatScalar *v; 2103 int jdx,idt,idx,nz,*vi,i,ai16; 2104 2105 /* forward solve the lower triangular */ 2106 idx = 0; 2107 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2108 for (i=1; i<n; i++) { 2109 v = aa + 16*ai[i]; 2110 vi = aj + ai[i]; 2111 nz = diag[i] - ai[i]; 2112 idx += 4; 2113 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2114 while (nz--) { 2115 jdx = 4*(*vi++); 2116 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2117 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2118 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2119 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2120 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2121 v += 16; 2122 } 2123 x[idx] = s1; 2124 x[1+idx] = s2; 2125 x[2+idx] = s3; 2126 x[3+idx] = s4; 2127 } 2128 /* backward solve the upper triangular */ 2129 idt = 4*(n-1); 2130 for (i=n-1; i>=0; i--){ 2131 ai16 = 16*diag[i]; 2132 v = aa + ai16 + 16; 2133 vi = aj + diag[i] + 1; 2134 nz = ai[i+1] - diag[i] - 1; 2135 s1 = x[idt]; s2 = x[1+idt]; 2136 s3 = x[2+idt];s4 = x[3+idt]; 2137 while (nz--) { 2138 idx = 4*(*vi++); 2139 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2140 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2144 v += 16; 2145 } 2146 v = aa + ai16; 2147 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2148 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2149 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2150 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2151 idt -= 4; 2152 } 2153 } 2154 #endif 2155 2156 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2158 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2159 PetscFunctionReturn(0); 2160 } 2161 2162 #undef __FUNCT__ 2163 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2164 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2165 { 2166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2167 int n=a->mbs,*ai=a->i,*aj=a->j; 2168 int ierr,*diag = a->diag; 2169 MatScalar *aa=a->a; 2170 PetscScalar *x,*b; 2171 2172 PetscFunctionBegin; 2173 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2174 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2175 2176 { 2177 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2178 MatScalar *v,*t=(MatScalar *)x; 2179 int jdx,idt,idx,nz,*vi,i,ai16; 2180 2181 /* forward solve the lower triangular */ 2182 idx = 0; 2183 t[0] = (MatScalar)b[0]; 2184 t[1] = (MatScalar)b[1]; 2185 t[2] = (MatScalar)b[2]; 2186 t[3] = (MatScalar)b[3]; 2187 for (i=1; i<n; i++) { 2188 v = aa + 16*ai[i]; 2189 vi = aj + ai[i]; 2190 nz = diag[i] - ai[i]; 2191 idx += 4; 2192 s1 = (MatScalar)b[idx]; 2193 s2 = (MatScalar)b[1+idx]; 2194 s3 = (MatScalar)b[2+idx]; 2195 s4 = (MatScalar)b[3+idx]; 2196 while (nz--) { 2197 jdx = 4*(*vi++); 2198 x1 = t[jdx]; 2199 x2 = t[1+jdx]; 2200 x3 = t[2+jdx]; 2201 x4 = t[3+jdx]; 2202 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2203 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2204 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2205 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2206 v += 16; 2207 } 2208 t[idx] = s1; 2209 t[1+idx] = s2; 2210 t[2+idx] = s3; 2211 t[3+idx] = s4; 2212 } 2213 /* backward solve the upper triangular */ 2214 idt = 4*(n-1); 2215 for (i=n-1; i>=0; i--){ 2216 ai16 = 16*diag[i]; 2217 v = aa + ai16 + 16; 2218 vi = aj + diag[i] + 1; 2219 nz = ai[i+1] - diag[i] - 1; 2220 s1 = t[idt]; 2221 s2 = t[1+idt]; 2222 s3 = t[2+idt]; 2223 s4 = t[3+idt]; 2224 while (nz--) { 2225 idx = 4*(*vi++); 2226 x1 = (MatScalar)x[idx]; 2227 x2 = (MatScalar)x[1+idx]; 2228 x3 = (MatScalar)x[2+idx]; 2229 x4 = (MatScalar)x[3+idx]; 2230 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2234 v += 16; 2235 } 2236 v = aa + ai16; 2237 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2238 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2239 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2240 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2241 idt -= 4; 2242 } 2243 } 2244 2245 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2247 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2248 PetscFunctionReturn(0); 2249 } 2250 2251 #if defined (PETSC_HAVE_SSE) 2252 2253 #include PETSC_HAVE_SSE 2254 #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */ 2255 #undef __FUNCT__ 2256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2257 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2258 { 2259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2260 unsigned short *aj=(unsigned short *)a->j; 2261 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2262 MatScalar *aa=a->a; 2263 PetscScalar *x,*b; 2264 2265 PetscFunctionBegin; 2266 SSE_SCOPE_BEGIN; 2267 /* 2268 Note: This code currently uses demotion of double 2269 to float when performing the mixed-mode computation. 2270 This may not be numerically reasonable for all applications. 2271 */ 2272 PREFETCH_NTA(aa+16*ai[1]); 2273 2274 ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2275 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2276 { 2277 /* x will first be computed in single precision then promoted inplace to double */ 2278 MatScalar *v,*t=(MatScalar *)x; 2279 int nz,i,idt,ai16; 2280 unsigned int jdx,idx; 2281 unsigned short *vi; 2282 /* Forward solve the lower triangular factor. */ 2283 2284 /* First block is the identity. */ 2285 idx = 0; 2286 CONVERT_DOUBLE4_FLOAT4(t,b); 2287 v = aa + 16*((unsigned int)ai[1]); 2288 2289 for (i=1; i<n;) { 2290 PREFETCH_NTA(&v[8]); 2291 vi = aj + ai[i]; 2292 nz = diag[i] - ai[i]; 2293 idx += 4; 2294 2295 /* Demote RHS from double to float. */ 2296 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2297 LOAD_PS(&t[idx],XMM7); 2298 2299 while (nz--) { 2300 PREFETCH_NTA(&v[16]); 2301 jdx = 4*((unsigned int)(*vi++)); 2302 2303 /* 4x4 Matrix-Vector product with negative accumulation: */ 2304 SSE_INLINE_BEGIN_2(&t[jdx],v) 2305 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2306 2307 /* First Column */ 2308 SSE_COPY_PS(XMM0,XMM6) 2309 SSE_SHUFFLE(XMM0,XMM0,0x00) 2310 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2311 SSE_SUB_PS(XMM7,XMM0) 2312 2313 /* Second Column */ 2314 SSE_COPY_PS(XMM1,XMM6) 2315 SSE_SHUFFLE(XMM1,XMM1,0x55) 2316 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2317 SSE_SUB_PS(XMM7,XMM1) 2318 2319 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2320 2321 /* Third Column */ 2322 SSE_COPY_PS(XMM2,XMM6) 2323 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2324 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2325 SSE_SUB_PS(XMM7,XMM2) 2326 2327 /* Fourth Column */ 2328 SSE_COPY_PS(XMM3,XMM6) 2329 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2330 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2331 SSE_SUB_PS(XMM7,XMM3) 2332 SSE_INLINE_END_2 2333 2334 v += 16; 2335 } 2336 v = aa + 16*ai[++i]; 2337 PREFETCH_NTA(v); 2338 STORE_PS(&t[idx],XMM7); 2339 } 2340 2341 /* Backward solve the upper triangular factor.*/ 2342 2343 idt = 4*(n-1); 2344 ai16 = 16*diag[n-1]; 2345 v = aa + ai16 + 16; 2346 for (i=n-1; i>=0;){ 2347 PREFETCH_NTA(&v[8]); 2348 vi = aj + diag[i] + 1; 2349 nz = ai[i+1] - diag[i] - 1; 2350 2351 LOAD_PS(&t[idt],XMM7); 2352 2353 while (nz--) { 2354 PREFETCH_NTA(&v[16]); 2355 idx = 4*((unsigned int)(*vi++)); 2356 2357 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2358 SSE_INLINE_BEGIN_2(&t[idx],v) 2359 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2360 2361 /* First Column */ 2362 SSE_COPY_PS(XMM0,XMM6) 2363 SSE_SHUFFLE(XMM0,XMM0,0x00) 2364 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2365 SSE_SUB_PS(XMM7,XMM0) 2366 2367 /* Second Column */ 2368 SSE_COPY_PS(XMM1,XMM6) 2369 SSE_SHUFFLE(XMM1,XMM1,0x55) 2370 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2371 SSE_SUB_PS(XMM7,XMM1) 2372 2373 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2374 2375 /* Third Column */ 2376 SSE_COPY_PS(XMM2,XMM6) 2377 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2378 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2379 SSE_SUB_PS(XMM7,XMM2) 2380 2381 /* Fourth Column */ 2382 SSE_COPY_PS(XMM3,XMM6) 2383 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2384 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2385 SSE_SUB_PS(XMM7,XMM3) 2386 SSE_INLINE_END_2 2387 v += 16; 2388 } 2389 v = aa + ai16; 2390 ai16 = 16*diag[--i]; 2391 PREFETCH_NTA(aa+ai16+16); 2392 /* 2393 Scale the result by the diagonal 4x4 block, 2394 which was inverted as part of the factorization 2395 */ 2396 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2397 /* First Column */ 2398 SSE_COPY_PS(XMM0,XMM7) 2399 SSE_SHUFFLE(XMM0,XMM0,0x00) 2400 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2401 2402 /* Second Column */ 2403 SSE_COPY_PS(XMM1,XMM7) 2404 SSE_SHUFFLE(XMM1,XMM1,0x55) 2405 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2406 SSE_ADD_PS(XMM0,XMM1) 2407 2408 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2409 2410 /* Third Column */ 2411 SSE_COPY_PS(XMM2,XMM7) 2412 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2413 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2414 SSE_ADD_PS(XMM0,XMM2) 2415 2416 /* Fourth Column */ 2417 SSE_COPY_PS(XMM3,XMM7) 2418 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2419 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2420 SSE_ADD_PS(XMM0,XMM3) 2421 2422 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2423 SSE_INLINE_END_3 2424 2425 v = aa + ai16 + 16; 2426 idt -= 4; 2427 } 2428 2429 /* Convert t from single precision back to double precision (inplace)*/ 2430 idt = 4*(n-1); 2431 for (i=n-1;i>=0;i--) { 2432 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2433 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2434 PetscScalar *xtemp=&x[idt]; 2435 MatScalar *ttemp=&t[idt]; 2436 xtemp[3] = (PetscScalar)ttemp[3]; 2437 xtemp[2] = (PetscScalar)ttemp[2]; 2438 xtemp[1] = (PetscScalar)ttemp[1]; 2439 xtemp[0] = (PetscScalar)ttemp[0]; 2440 idt -= 4; 2441 } 2442 2443 } /* End of artificial scope. */ 2444 ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2445 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2446 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2447 SSE_SCOPE_END; 2448 PetscFunctionReturn(0); 2449 } 2450 2451 #undef __FUNCT__ 2452 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2453 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2454 { 2455 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2456 int *aj=a->j; 2457 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2458 MatScalar *aa=a->a; 2459 PetscScalar *x,*b; 2460 2461 PetscFunctionBegin; 2462 SSE_SCOPE_BEGIN; 2463 /* 2464 Note: This code currently uses demotion of double 2465 to float when performing the mixed-mode computation. 2466 This may not be numerically reasonable for all applications. 2467 */ 2468 PREFETCH_NTA(aa+16*ai[1]); 2469 2470 ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2471 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2472 { 2473 /* x will first be computed in single precision then promoted inplace to double */ 2474 MatScalar *v,*t=(MatScalar *)x; 2475 int nz,i,idt,ai16; 2476 int jdx,idx; 2477 int *vi; 2478 /* Forward solve the lower triangular factor. */ 2479 2480 /* First block is the identity. */ 2481 idx = 0; 2482 CONVERT_DOUBLE4_FLOAT4(t,b); 2483 v = aa + 16*ai[1]; 2484 2485 for (i=1; i<n;) { 2486 PREFETCH_NTA(&v[8]); 2487 vi = aj + ai[i]; 2488 nz = diag[i] - ai[i]; 2489 idx += 4; 2490 2491 /* Demote RHS from double to float. */ 2492 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2493 LOAD_PS(&t[idx],XMM7); 2494 2495 while (nz--) { 2496 PREFETCH_NTA(&v[16]); 2497 jdx = 4*(*vi++); 2498 /* jdx = *vi++; */ 2499 2500 /* 4x4 Matrix-Vector product with negative accumulation: */ 2501 SSE_INLINE_BEGIN_2(&t[jdx],v) 2502 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2503 2504 /* First Column */ 2505 SSE_COPY_PS(XMM0,XMM6) 2506 SSE_SHUFFLE(XMM0,XMM0,0x00) 2507 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2508 SSE_SUB_PS(XMM7,XMM0) 2509 2510 /* Second Column */ 2511 SSE_COPY_PS(XMM1,XMM6) 2512 SSE_SHUFFLE(XMM1,XMM1,0x55) 2513 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2514 SSE_SUB_PS(XMM7,XMM1) 2515 2516 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2517 2518 /* Third Column */ 2519 SSE_COPY_PS(XMM2,XMM6) 2520 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2521 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2522 SSE_SUB_PS(XMM7,XMM2) 2523 2524 /* Fourth Column */ 2525 SSE_COPY_PS(XMM3,XMM6) 2526 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2527 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2528 SSE_SUB_PS(XMM7,XMM3) 2529 SSE_INLINE_END_2 2530 2531 v += 16; 2532 } 2533 v = aa + 16*ai[++i]; 2534 PREFETCH_NTA(v); 2535 STORE_PS(&t[idx],XMM7); 2536 } 2537 2538 /* Backward solve the upper triangular factor.*/ 2539 2540 idt = 4*(n-1); 2541 ai16 = 16*diag[n-1]; 2542 v = aa + ai16 + 16; 2543 for (i=n-1; i>=0;){ 2544 PREFETCH_NTA(&v[8]); 2545 vi = aj + diag[i] + 1; 2546 nz = ai[i+1] - diag[i] - 1; 2547 2548 LOAD_PS(&t[idt],XMM7); 2549 2550 while (nz--) { 2551 PREFETCH_NTA(&v[16]); 2552 idx = 4*(*vi++); 2553 /* idx = *vi++; */ 2554 2555 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2556 SSE_INLINE_BEGIN_2(&t[idx],v) 2557 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2558 2559 /* First Column */ 2560 SSE_COPY_PS(XMM0,XMM6) 2561 SSE_SHUFFLE(XMM0,XMM0,0x00) 2562 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2563 SSE_SUB_PS(XMM7,XMM0) 2564 2565 /* Second Column */ 2566 SSE_COPY_PS(XMM1,XMM6) 2567 SSE_SHUFFLE(XMM1,XMM1,0x55) 2568 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2569 SSE_SUB_PS(XMM7,XMM1) 2570 2571 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2572 2573 /* Third Column */ 2574 SSE_COPY_PS(XMM2,XMM6) 2575 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2576 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2577 SSE_SUB_PS(XMM7,XMM2) 2578 2579 /* Fourth Column */ 2580 SSE_COPY_PS(XMM3,XMM6) 2581 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2582 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2583 SSE_SUB_PS(XMM7,XMM3) 2584 SSE_INLINE_END_2 2585 v += 16; 2586 } 2587 v = aa + ai16; 2588 ai16 = 16*diag[--i]; 2589 PREFETCH_NTA(aa+ai16+16); 2590 /* 2591 Scale the result by the diagonal 4x4 block, 2592 which was inverted as part of the factorization 2593 */ 2594 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2595 /* First Column */ 2596 SSE_COPY_PS(XMM0,XMM7) 2597 SSE_SHUFFLE(XMM0,XMM0,0x00) 2598 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2599 2600 /* Second Column */ 2601 SSE_COPY_PS(XMM1,XMM7) 2602 SSE_SHUFFLE(XMM1,XMM1,0x55) 2603 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2604 SSE_ADD_PS(XMM0,XMM1) 2605 2606 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2607 2608 /* Third Column */ 2609 SSE_COPY_PS(XMM2,XMM7) 2610 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2611 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2612 SSE_ADD_PS(XMM0,XMM2) 2613 2614 /* Fourth Column */ 2615 SSE_COPY_PS(XMM3,XMM7) 2616 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2617 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2618 SSE_ADD_PS(XMM0,XMM3) 2619 2620 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2621 SSE_INLINE_END_3 2622 2623 v = aa + ai16 + 16; 2624 idt -= 4; 2625 } 2626 2627 /* Convert t from single precision back to double precision (inplace)*/ 2628 idt = 4*(n-1); 2629 for (i=n-1;i>=0;i--) { 2630 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2631 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2632 PetscScalar *xtemp=&x[idt]; 2633 MatScalar *ttemp=&t[idt]; 2634 xtemp[3] = (PetscScalar)ttemp[3]; 2635 xtemp[2] = (PetscScalar)ttemp[2]; 2636 xtemp[1] = (PetscScalar)ttemp[1]; 2637 xtemp[0] = (PetscScalar)ttemp[0]; 2638 idt -= 4; 2639 } 2640 2641 } /* End of artificial scope. */ 2642 ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2643 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2644 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2645 SSE_SCOPE_END; 2646 PetscFunctionReturn(0); 2647 } 2648 2649 #endif 2650 #undef __FUNCT__ 2651 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2652 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2653 { 2654 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2655 IS iscol=a->col,isrow=a->row; 2656 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2657 int *diag = a->diag; 2658 MatScalar *aa=a->a,*v; 2659 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2660 2661 PetscFunctionBegin; 2662 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2663 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2664 t = a->solve_work; 2665 2666 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2667 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2668 2669 /* forward solve the lower triangular */ 2670 idx = 3*(*r++); 2671 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2672 for (i=1; i<n; i++) { 2673 v = aa + 9*ai[i]; 2674 vi = aj + ai[i]; 2675 nz = diag[i] - ai[i]; 2676 idx = 3*(*r++); 2677 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2678 while (nz--) { 2679 idx = 3*(*vi++); 2680 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2681 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2682 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2683 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2684 v += 9; 2685 } 2686 idx = 3*i; 2687 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2688 } 2689 /* backward solve the upper triangular */ 2690 for (i=n-1; i>=0; i--){ 2691 v = aa + 9*diag[i] + 9; 2692 vi = aj + diag[i] + 1; 2693 nz = ai[i+1] - diag[i] - 1; 2694 idt = 3*i; 2695 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2696 while (nz--) { 2697 idx = 3*(*vi++); 2698 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2699 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2700 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2701 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2702 v += 9; 2703 } 2704 idc = 3*(*c--); 2705 v = aa + 9*diag[i]; 2706 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2707 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2708 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2709 } 2710 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2711 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2712 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2713 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2714 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2715 PetscFunctionReturn(0); 2716 } 2717 2718 /* 2719 Special case where the matrix was ILU(0) factored in the natural 2720 ordering. This eliminates the need for the column and row permutation. 2721 */ 2722 #undef __FUNCT__ 2723 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2724 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2725 { 2726 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2727 int n=a->mbs,*ai=a->i,*aj=a->j; 2728 int ierr,*diag = a->diag; 2729 MatScalar *aa=a->a,*v; 2730 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2731 int jdx,idt,idx,nz,*vi,i; 2732 2733 PetscFunctionBegin; 2734 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2735 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2736 2737 2738 /* forward solve the lower triangular */ 2739 idx = 0; 2740 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2741 for (i=1; i<n; i++) { 2742 v = aa + 9*ai[i]; 2743 vi = aj + ai[i]; 2744 nz = diag[i] - ai[i]; 2745 idx += 3; 2746 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2747 while (nz--) { 2748 jdx = 3*(*vi++); 2749 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2750 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2751 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2752 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2753 v += 9; 2754 } 2755 x[idx] = s1; 2756 x[1+idx] = s2; 2757 x[2+idx] = s3; 2758 } 2759 /* backward solve the upper triangular */ 2760 for (i=n-1; i>=0; i--){ 2761 v = aa + 9*diag[i] + 9; 2762 vi = aj + diag[i] + 1; 2763 nz = ai[i+1] - diag[i] - 1; 2764 idt = 3*i; 2765 s1 = x[idt]; s2 = x[1+idt]; 2766 s3 = x[2+idt]; 2767 while (nz--) { 2768 idx = 3*(*vi++); 2769 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2770 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2771 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2772 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2773 v += 9; 2774 } 2775 v = aa + 9*diag[i]; 2776 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2777 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2778 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2779 } 2780 2781 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2782 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2783 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2784 PetscFunctionReturn(0); 2785 } 2786 2787 #undef __FUNCT__ 2788 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2789 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2790 { 2791 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2792 IS iscol=a->col,isrow=a->row; 2793 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2794 int *diag = a->diag; 2795 MatScalar *aa=a->a,*v; 2796 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2797 2798 PetscFunctionBegin; 2799 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2800 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2801 t = a->solve_work; 2802 2803 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2804 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2805 2806 /* forward solve the lower triangular */ 2807 idx = 2*(*r++); 2808 t[0] = b[idx]; t[1] = b[1+idx]; 2809 for (i=1; i<n; i++) { 2810 v = aa + 4*ai[i]; 2811 vi = aj + ai[i]; 2812 nz = diag[i] - ai[i]; 2813 idx = 2*(*r++); 2814 s1 = b[idx]; s2 = b[1+idx]; 2815 while (nz--) { 2816 idx = 2*(*vi++); 2817 x1 = t[idx]; x2 = t[1+idx]; 2818 s1 -= v[0]*x1 + v[2]*x2; 2819 s2 -= v[1]*x1 + v[3]*x2; 2820 v += 4; 2821 } 2822 idx = 2*i; 2823 t[idx] = s1; t[1+idx] = s2; 2824 } 2825 /* backward solve the upper triangular */ 2826 for (i=n-1; i>=0; i--){ 2827 v = aa + 4*diag[i] + 4; 2828 vi = aj + diag[i] + 1; 2829 nz = ai[i+1] - diag[i] - 1; 2830 idt = 2*i; 2831 s1 = t[idt]; s2 = t[1+idt]; 2832 while (nz--) { 2833 idx = 2*(*vi++); 2834 x1 = t[idx]; x2 = t[1+idx]; 2835 s1 -= v[0]*x1 + v[2]*x2; 2836 s2 -= v[1]*x1 + v[3]*x2; 2837 v += 4; 2838 } 2839 idc = 2*(*c--); 2840 v = aa + 4*diag[i]; 2841 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2842 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2843 } 2844 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2845 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2846 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2847 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2848 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2849 PetscFunctionReturn(0); 2850 } 2851 2852 /* 2853 Special case where the matrix was ILU(0) factored in the natural 2854 ordering. This eliminates the need for the column and row permutation. 2855 */ 2856 #undef __FUNCT__ 2857 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2858 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2859 { 2860 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2861 int n=a->mbs,*ai=a->i,*aj=a->j; 2862 int ierr,*diag = a->diag; 2863 MatScalar *aa=a->a,*v; 2864 PetscScalar *x,*b,s1,s2,x1,x2; 2865 int jdx,idt,idx,nz,*vi,i; 2866 2867 PetscFunctionBegin; 2868 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2869 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2870 2871 /* forward solve the lower triangular */ 2872 idx = 0; 2873 x[0] = b[0]; x[1] = b[1]; 2874 for (i=1; i<n; i++) { 2875 v = aa + 4*ai[i]; 2876 vi = aj + ai[i]; 2877 nz = diag[i] - ai[i]; 2878 idx += 2; 2879 s1 = b[idx];s2 = b[1+idx]; 2880 while (nz--) { 2881 jdx = 2*(*vi++); 2882 x1 = x[jdx];x2 = x[1+jdx]; 2883 s1 -= v[0]*x1 + v[2]*x2; 2884 s2 -= v[1]*x1 + v[3]*x2; 2885 v += 4; 2886 } 2887 x[idx] = s1; 2888 x[1+idx] = s2; 2889 } 2890 /* backward solve the upper triangular */ 2891 for (i=n-1; i>=0; i--){ 2892 v = aa + 4*diag[i] + 4; 2893 vi = aj + diag[i] + 1; 2894 nz = ai[i+1] - diag[i] - 1; 2895 idt = 2*i; 2896 s1 = x[idt]; s2 = x[1+idt]; 2897 while (nz--) { 2898 idx = 2*(*vi++); 2899 x1 = x[idx]; x2 = x[1+idx]; 2900 s1 -= v[0]*x1 + v[2]*x2; 2901 s2 -= v[1]*x1 + v[3]*x2; 2902 v += 4; 2903 } 2904 v = aa + 4*diag[i]; 2905 x[idt] = v[0]*s1 + v[2]*s2; 2906 x[1+idt] = v[1]*s1 + v[3]*s2; 2907 } 2908 2909 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2910 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2911 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2912 PetscFunctionReturn(0); 2913 } 2914 2915 #undef __FUNCT__ 2916 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2917 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2918 { 2919 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2920 IS iscol=a->col,isrow=a->row; 2921 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2922 int *diag = a->diag; 2923 MatScalar *aa=a->a,*v; 2924 PetscScalar *x,*b,s1,*t; 2925 2926 PetscFunctionBegin; 2927 if (!n) PetscFunctionReturn(0); 2928 2929 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2930 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2931 t = a->solve_work; 2932 2933 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2934 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2935 2936 /* forward solve the lower triangular */ 2937 t[0] = b[*r++]; 2938 for (i=1; i<n; i++) { 2939 v = aa + ai[i]; 2940 vi = aj + ai[i]; 2941 nz = diag[i] - ai[i]; 2942 s1 = b[*r++]; 2943 while (nz--) { 2944 s1 -= (*v++)*t[*vi++]; 2945 } 2946 t[i] = s1; 2947 } 2948 /* backward solve the upper triangular */ 2949 for (i=n-1; i>=0; i--){ 2950 v = aa + diag[i] + 1; 2951 vi = aj + diag[i] + 1; 2952 nz = ai[i+1] - diag[i] - 1; 2953 s1 = t[i]; 2954 while (nz--) { 2955 s1 -= (*v++)*t[*vi++]; 2956 } 2957 x[*c--] = t[i] = aa[diag[i]]*s1; 2958 } 2959 2960 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2961 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2962 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2963 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2964 PetscLogFlops(2*1*(a->nz) - A->n); 2965 PetscFunctionReturn(0); 2966 } 2967 /* 2968 Special case where the matrix was ILU(0) factored in the natural 2969 ordering. This eliminates the need for the column and row permutation. 2970 */ 2971 #undef __FUNCT__ 2972 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2973 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2974 { 2975 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2976 int n=a->mbs,*ai=a->i,*aj=a->j; 2977 int ierr,*diag = a->diag; 2978 MatScalar *aa=a->a; 2979 PetscScalar *x,*b; 2980 PetscScalar s1,x1; 2981 MatScalar *v; 2982 int jdx,idt,idx,nz,*vi,i; 2983 2984 PetscFunctionBegin; 2985 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2986 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2987 2988 /* forward solve the lower triangular */ 2989 idx = 0; 2990 x[0] = b[0]; 2991 for (i=1; i<n; i++) { 2992 v = aa + ai[i]; 2993 vi = aj + ai[i]; 2994 nz = diag[i] - ai[i]; 2995 idx += 1; 2996 s1 = b[idx]; 2997 while (nz--) { 2998 jdx = *vi++; 2999 x1 = x[jdx]; 3000 s1 -= v[0]*x1; 3001 v += 1; 3002 } 3003 x[idx] = s1; 3004 } 3005 /* backward solve the upper triangular */ 3006 for (i=n-1; i>=0; i--){ 3007 v = aa + diag[i] + 1; 3008 vi = aj + diag[i] + 1; 3009 nz = ai[i+1] - diag[i] - 1; 3010 idt = i; 3011 s1 = x[idt]; 3012 while (nz--) { 3013 idx = *vi++; 3014 x1 = x[idx]; 3015 s1 -= v[0]*x1; 3016 v += 1; 3017 } 3018 v = aa + diag[i]; 3019 x[idt] = v[0]*s1; 3020 } 3021 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3022 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3023 PetscLogFlops(2*(a->nz) - A->n); 3024 PetscFunctionReturn(0); 3025 } 3026 3027 /* ----------------------------------------------------------------*/ 3028 /* 3029 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3030 except that the data structure of Mat_SeqAIJ is slightly different. 3031 Not a good example of code reuse. 3032 */ 3033 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 3034 3035 #undef __FUNCT__ 3036 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3037 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3038 { 3039 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3040 IS isicol; 3041 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3042 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3043 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3044 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 3045 PetscTruth col_identity,row_identity; 3046 PetscReal f; 3047 3048 PetscFunctionBegin; 3049 f = info->fill; 3050 levels = (int)info->levels; 3051 diagonal_fill = (int)info->diagonal_fill; 3052 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3053 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3054 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3055 3056 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3057 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3058 (*fact)->factor = FACTOR_LU; 3059 b = (Mat_SeqBAIJ*)(*fact)->data; 3060 if (!b->diag) { 3061 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3062 } 3063 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3064 b->row = isrow; 3065 b->col = iscol; 3066 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3067 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3068 b->icol = isicol; 3069 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3070 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3071 } else { /* general case perform the symbolic factorization */ 3072 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3073 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3074 3075 /* get new row pointers */ 3076 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 3077 ainew[0] = 0; 3078 /* don't know how many column pointers are needed so estimate */ 3079 jmax = (int)(f*ai[n] + 1); 3080 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 3081 /* ajfill is level of fill for each fill entry */ 3082 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 3083 /* fill is a linked list of nonzeros in active row */ 3084 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 3085 /* im is level for each filled value */ 3086 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 3087 /* dloc is location of diagonal in factor */ 3088 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 3089 dloc[0] = 0; 3090 for (prow=0; prow<n; prow++) { 3091 3092 /* copy prow into linked list */ 3093 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3094 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3095 xi = aj + ai[r[prow]]; 3096 fill[n] = n; 3097 fill[prow] = -1; /* marker for diagonal entry */ 3098 while (nz--) { 3099 fm = n; 3100 idx = ic[*xi++]; 3101 do { 3102 m = fm; 3103 fm = fill[m]; 3104 } while (fm < idx); 3105 fill[m] = idx; 3106 fill[idx] = fm; 3107 im[idx] = 0; 3108 } 3109 3110 /* make sure diagonal entry is included */ 3111 if (diagonal_fill && fill[prow] == -1) { 3112 fm = n; 3113 while (fill[fm] < prow) fm = fill[fm]; 3114 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3115 fill[fm] = prow; 3116 im[prow] = 0; 3117 nzf++; 3118 dcount++; 3119 } 3120 3121 nzi = 0; 3122 row = fill[n]; 3123 while (row < prow) { 3124 incrlev = im[row] + 1; 3125 nz = dloc[row]; 3126 xi = ajnew + ainew[row] + nz + 1; 3127 flev = ajfill + ainew[row] + nz + 1; 3128 nnz = ainew[row+1] - ainew[row] - nz - 1; 3129 fm = row; 3130 while (nnz-- > 0) { 3131 idx = *xi++; 3132 if (*flev + incrlev > levels) { 3133 flev++; 3134 continue; 3135 } 3136 do { 3137 m = fm; 3138 fm = fill[m]; 3139 } while (fm < idx); 3140 if (fm != idx) { 3141 im[idx] = *flev + incrlev; 3142 fill[m] = idx; 3143 fill[idx] = fm; 3144 fm = idx; 3145 nzf++; 3146 } else { 3147 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3148 } 3149 flev++; 3150 } 3151 row = fill[row]; 3152 nzi++; 3153 } 3154 /* copy new filled row into permanent storage */ 3155 ainew[prow+1] = ainew[prow] + nzf; 3156 if (ainew[prow+1] > jmax) { 3157 3158 /* estimate how much additional space we will need */ 3159 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3160 /* just double the memory each time */ 3161 int maxadd = jmax; 3162 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3163 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3164 jmax += maxadd; 3165 3166 /* allocate a longer ajnew and ajfill */ 3167 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3168 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3169 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3170 ajnew = xi; 3171 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3172 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3173 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3174 ajfill = xi; 3175 reallocate++; /* count how many reallocations are needed */ 3176 } 3177 xi = ajnew + ainew[prow]; 3178 flev = ajfill + ainew[prow]; 3179 dloc[prow] = nzi; 3180 fm = fill[n]; 3181 while (nzf--) { 3182 *xi++ = fm; 3183 *flev++ = im[fm]; 3184 fm = fill[fm]; 3185 } 3186 /* make sure row has diagonal entry */ 3187 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3188 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3189 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3190 } 3191 } 3192 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3193 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3194 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3195 ierr = PetscFree(fill);CHKERRQ(ierr); 3196 ierr = PetscFree(im);CHKERRQ(ierr); 3197 3198 { 3199 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3200 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3201 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3202 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3203 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3204 if (diagonal_fill) { 3205 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3206 } 3207 } 3208 3209 /* put together the new matrix */ 3210 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 3211 PetscLogObjectParent(*fact,isicol); 3212 b = (Mat_SeqBAIJ*)(*fact)->data; 3213 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3214 b->singlemalloc = PETSC_FALSE; 3215 len = bs2*ainew[n]*sizeof(MatScalar); 3216 /* the next line frees the default space generated by the Create() */ 3217 ierr = PetscFree(b->a);CHKERRQ(ierr); 3218 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3219 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3220 b->j = ajnew; 3221 b->i = ainew; 3222 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3223 b->diag = dloc; 3224 b->ilen = 0; 3225 b->imax = 0; 3226 b->row = isrow; 3227 b->col = iscol; 3228 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3229 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3230 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3231 b->icol = isicol; 3232 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3233 /* In b structure: Free imax, ilen, old a, old j. 3234 Allocate dloc, solve_work, new a, new j */ 3235 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3236 b->maxnz = b->nz = ainew[n]; 3237 (*fact)->factor = FACTOR_LU; 3238 3239 (*fact)->info.factor_mallocs = reallocate; 3240 (*fact)->info.fill_ratio_given = f; 3241 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3242 } 3243 3244 if (row_identity && col_identity) { 3245 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3246 } 3247 PetscFunctionReturn(0); 3248 } 3249 3250 #undef __FUNCT__ 3251 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3252 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3253 { 3254 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3255 /* int i,*AJ=a->j,nz=a->nz; */ 3256 PetscFunctionBegin; 3257 /* Undo Column scaling */ 3258 /* while (nz--) { */ 3259 /* AJ[i] = AJ[i]/4; */ 3260 /* } */ 3261 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3262 A->ops->setunfactored = PETSC_NULL; 3263 PetscFunctionReturn(0); 3264 } 3265 3266 #undef __FUNCT__ 3267 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3268 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3269 { 3270 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3271 int *AJ=a->j,nz=a->nz; 3272 unsigned short *aj=(unsigned short *)AJ; 3273 PetscFunctionBegin; 3274 /* Is this really necessary? */ 3275 while (nz--) { 3276 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3277 } 3278 A->ops->setunfactored = PETSC_NULL; 3279 PetscFunctionReturn(0); 3280 } 3281 3282 #undef __FUNCT__ 3283 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3284 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3285 { 3286 /* 3287 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3288 with natural ordering 3289 */ 3290 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3291 3292 PetscFunctionBegin; 3293 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3294 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3295 switch (a->bs) { 3296 case 1: 3297 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3298 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3299 break; 3300 case 2: 3301 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3302 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3303 break; 3304 case 3: 3305 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3306 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3307 break; 3308 case 4: 3309 #if defined(PETSC_USE_MAT_SINGLE) 3310 { 3311 PetscTruth sse_enabled_local; 3312 int ierr; 3313 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3314 if (sse_enabled_local) { 3315 # if defined(PETSC_HAVE_SSE) 3316 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3317 if (n==(unsigned short)n) { 3318 unsigned short *aj=(unsigned short *)AJ; 3319 for (i=0;i<nz;i++) { 3320 aj[i] = (unsigned short)AJ[i]; 3321 } 3322 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3323 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3324 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 3325 } else { 3326 /* Scale the column indices for easier indexing in MatSolve. */ 3327 /* for (i=0;i<nz;i++) { */ 3328 /* AJ[i] = AJ[i]*4; */ 3329 /* } */ 3330 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3331 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3332 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 3333 } 3334 # else 3335 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3336 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3337 # endif 3338 } else { 3339 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3340 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3341 } 3342 } 3343 #else 3344 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3345 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3346 #endif 3347 break; 3348 case 5: 3349 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3350 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3351 break; 3352 case 6: 3353 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3354 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3355 break; 3356 case 7: 3357 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3358 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3359 break; 3360 } 3361 PetscFunctionReturn(0); 3362 } 3363 3364 #undef __FUNCT__ 3365 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3366 int MatSeqBAIJ_UpdateSolvers(Mat A) 3367 { 3368 /* 3369 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3370 with natural ordering 3371 */ 3372 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3373 IS row = a->row, col = a->col; 3374 PetscTruth row_identity, col_identity; 3375 PetscTruth use_natural; 3376 int ierr; 3377 3378 PetscFunctionBegin; 3379 3380 use_natural = PETSC_FALSE; 3381 3382 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3383 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3384 3385 if (row_identity && col_identity) { 3386 use_natural = PETSC_TRUE; 3387 } else { 3388 use_natural = PETSC_FALSE; 3389 } 3390 switch (a->bs) { 3391 case 1: 3392 if (use_natural) { 3393 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3394 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3395 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3396 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3397 } else { 3398 A->ops->solve = MatSolve_SeqBAIJ_1; 3399 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3400 } 3401 break; 3402 case 2: 3403 if (use_natural) { 3404 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3405 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3406 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3407 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3408 } else { 3409 A->ops->solve = MatSolve_SeqBAIJ_2; 3410 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3411 } 3412 break; 3413 case 3: 3414 if (use_natural) { 3415 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3416 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3417 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3418 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3419 } else { 3420 A->ops->solve = MatSolve_SeqBAIJ_3; 3421 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3422 } 3423 break; 3424 case 4: 3425 { 3426 PetscTruth sse_enabled_local; 3427 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3428 if (use_natural) { 3429 #if defined(PETSC_USE_MAT_SINGLE) 3430 if (sse_enabled_local) { /* Natural + Single + SSE */ 3431 # if defined(PETSC_HAVE_SSE) 3432 int n=a->mbs; 3433 if (n==(unsigned short)n) { 3434 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3435 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3436 } else { 3437 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3438 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3439 } 3440 # else 3441 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3442 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3443 # endif 3444 } else { /* Natural + Single */ 3445 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3446 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3447 } 3448 #else 3449 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3450 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3451 #endif 3452 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3453 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3454 } else { /* Arbitrary ordering */ 3455 #if defined(PETSC_USE_MAT_SINGLE) 3456 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3457 # if defined(PETSC_HAVE_SSE) 3458 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3459 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3460 # else 3461 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3462 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3463 # endif 3464 } else { /* Arbitrary + Single */ 3465 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3466 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3467 } 3468 #else 3469 A->ops->solve = MatSolve_SeqBAIJ_4; 3470 #endif 3471 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3472 } 3473 } 3474 break; 3475 case 5: 3476 if (use_natural) { 3477 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3478 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3479 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3480 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3481 } else { 3482 A->ops->solve = MatSolve_SeqBAIJ_5; 3483 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3484 } 3485 break; 3486 case 6: 3487 if (use_natural) { 3488 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3489 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3490 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3491 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3492 } else { 3493 A->ops->solve = MatSolve_SeqBAIJ_6; 3494 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3495 } 3496 break; 3497 case 7: 3498 if (use_natural) { 3499 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3500 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3501 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3502 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3503 } else { 3504 A->ops->solve = MatSolve_SeqBAIJ_7; 3505 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3506 } 3507 break; 3508 default: 3509 A->ops->solve = MatSolve_SeqBAIJ_N; 3510 break; 3511 } 3512 PetscFunctionReturn(0); 3513 } 3514 3515 #undef __FUNCT__ 3516 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3517 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3518 int ierr; 3519 3520 PetscFunctionBegin; 3521 ierr = MatSeqBAIJ_UpdateSolvers(A); 3522 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3523 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3524 } else { 3525 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3526 } 3527 PetscFunctionReturn(0); 3528 } 3529 3530 #undef __FUNCT__ 3531 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3532 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3533 int ierr; 3534 3535 PetscFunctionBegin; 3536 ierr = MatSeqBAIJ_UpdateSolvers(A); 3537 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3538 PetscFunctionReturn(0); 3539 } 3540 3541 3542