1 /* 2 Factorization code for BAIJ format. 3 */ 4 5 #include "src/mat/impls/baij/seq/baij.h" 6 #include "src/inline/ilu.h" 7 #include "src/inline/dot.h" 8 9 #undef __FUNCT__ 10 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 11 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 12 { 13 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 14 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 15 int *diag = a->diag; 16 MatScalar *aa=a->a,*v; 17 PetscScalar s1,*x,*b; 18 19 PetscFunctionBegin; 20 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 21 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 22 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 23 24 /* forward solve the U^T */ 25 for (i=0; i<n; i++) { 26 27 v = aa + diag[i]; 28 /* multiply by the inverse of the block diagonal */ 29 s1 = (*v++)*x[i]; 30 vi = aj + diag[i] + 1; 31 nz = ai[i+1] - diag[i] - 1; 32 while (nz--) { 33 x[*vi++] -= (*v++)*s1; 34 } 35 x[i] = s1; 36 } 37 /* backward solve the L^T */ 38 for (i=n-1; i>=0; i--){ 39 v = aa + diag[i] - 1; 40 vi = aj + diag[i] - 1; 41 nz = diag[i] - ai[i]; 42 s1 = x[i]; 43 while (nz--) { 44 x[*vi--] -= (*v--)*s1; 45 } 46 } 47 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 48 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 49 PetscLogFlops(2*(a->nz) - A->n); 50 PetscFunctionReturn(0); 51 } 52 53 #undef __FUNCT__ 54 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 55 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 56 { 57 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 58 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 59 int *diag = a->diag,oidx; 60 MatScalar *aa=a->a,*v; 61 PetscScalar s1,s2,x1,x2; 62 PetscScalar *x,*b; 63 64 PetscFunctionBegin; 65 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 66 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 67 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 68 69 /* forward solve the U^T */ 70 idx = 0; 71 for (i=0; i<n; i++) { 72 73 v = aa + 4*diag[i]; 74 /* multiply by the inverse of the block diagonal */ 75 x1 = x[idx]; x2 = x[1+idx]; 76 s1 = v[0]*x1 + v[1]*x2; 77 s2 = v[2]*x1 + v[3]*x2; 78 v += 4; 79 80 vi = aj + diag[i] + 1; 81 nz = ai[i+1] - diag[i] - 1; 82 while (nz--) { 83 oidx = 2*(*vi++); 84 x[oidx] -= v[0]*s1 + v[1]*s2; 85 x[oidx+1] -= v[2]*s1 + v[3]*s2; 86 v += 4; 87 } 88 x[idx] = s1;x[1+idx] = s2; 89 idx += 2; 90 } 91 /* backward solve the L^T */ 92 for (i=n-1; i>=0; i--){ 93 v = aa + 4*diag[i] - 4; 94 vi = aj + diag[i] - 1; 95 nz = diag[i] - ai[i]; 96 idt = 2*i; 97 s1 = x[idt]; s2 = x[1+idt]; 98 while (nz--) { 99 idx = 2*(*vi--); 100 x[idx] -= v[0]*s1 + v[1]*s2; 101 x[idx+1] -= v[2]*s1 + v[3]*s2; 102 v -= 4; 103 } 104 } 105 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 106 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 107 PetscLogFlops(2*4*(a->nz) - 2*A->n); 108 PetscFunctionReturn(0); 109 } 110 111 #undef __FUNCT__ 112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 113 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 114 { 115 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 116 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 117 int *diag = a->diag,oidx; 118 MatScalar *aa=a->a,*v; 119 PetscScalar s1,s2,s3,x1,x2,x3; 120 PetscScalar *x,*b; 121 122 PetscFunctionBegin; 123 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 124 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126 127 /* forward solve the U^T */ 128 idx = 0; 129 for (i=0; i<n; i++) { 130 131 v = aa + 9*diag[i]; 132 /* multiply by the inverse of the block diagonal */ 133 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 134 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 135 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 136 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 137 v += 9; 138 139 vi = aj + diag[i] + 1; 140 nz = ai[i+1] - diag[i] - 1; 141 while (nz--) { 142 oidx = 3*(*vi++); 143 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 144 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 145 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 146 v += 9; 147 } 148 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 149 idx += 3; 150 } 151 /* backward solve the L^T */ 152 for (i=n-1; i>=0; i--){ 153 v = aa + 9*diag[i] - 9; 154 vi = aj + diag[i] - 1; 155 nz = diag[i] - ai[i]; 156 idt = 3*i; 157 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 158 while (nz--) { 159 idx = 3*(*vi--); 160 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 161 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 162 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 163 v -= 9; 164 } 165 } 166 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 168 PetscLogFlops(2*9*(a->nz) - 3*A->n); 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNCT__ 173 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 174 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 175 { 176 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 177 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 178 int *diag = a->diag,oidx; 179 MatScalar *aa=a->a,*v; 180 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 181 PetscScalar *x,*b; 182 183 PetscFunctionBegin; 184 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 185 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 186 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 187 188 /* forward solve the U^T */ 189 idx = 0; 190 for (i=0; i<n; i++) { 191 192 v = aa + 16*diag[i]; 193 /* multiply by the inverse of the block diagonal */ 194 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 195 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 196 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 197 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 198 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 199 v += 16; 200 201 vi = aj + diag[i] + 1; 202 nz = ai[i+1] - diag[i] - 1; 203 while (nz--) { 204 oidx = 4*(*vi++); 205 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 206 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 207 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 208 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 209 v += 16; 210 } 211 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 212 idx += 4; 213 } 214 /* backward solve the L^T */ 215 for (i=n-1; i>=0; i--){ 216 v = aa + 16*diag[i] - 16; 217 vi = aj + diag[i] - 1; 218 nz = diag[i] - ai[i]; 219 idt = 4*i; 220 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 221 while (nz--) { 222 idx = 4*(*vi--); 223 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 224 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 225 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 226 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 227 v -= 16; 228 } 229 } 230 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 231 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 232 PetscLogFlops(2*16*(a->nz) - 4*A->n); 233 PetscFunctionReturn(0); 234 } 235 236 #undef __FUNCT__ 237 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 238 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 239 { 240 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 241 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 242 int *diag = a->diag,oidx; 243 MatScalar *aa=a->a,*v; 244 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 245 PetscScalar *x,*b; 246 247 PetscFunctionBegin; 248 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 249 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 250 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 251 252 /* forward solve the U^T */ 253 idx = 0; 254 for (i=0; i<n; i++) { 255 256 v = aa + 25*diag[i]; 257 /* multiply by the inverse of the block diagonal */ 258 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 259 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 260 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 261 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 262 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 263 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 264 v += 25; 265 266 vi = aj + diag[i] + 1; 267 nz = ai[i+1] - diag[i] - 1; 268 while (nz--) { 269 oidx = 5*(*vi++); 270 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 271 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 272 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 273 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 274 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 275 v += 25; 276 } 277 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 278 idx += 5; 279 } 280 /* backward solve the L^T */ 281 for (i=n-1; i>=0; i--){ 282 v = aa + 25*diag[i] - 25; 283 vi = aj + diag[i] - 1; 284 nz = diag[i] - ai[i]; 285 idt = 5*i; 286 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 287 while (nz--) { 288 idx = 5*(*vi--); 289 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 290 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 291 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 292 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 293 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 294 v -= 25; 295 } 296 } 297 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 298 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 299 PetscLogFlops(2*25*(a->nz) - 5*A->n); 300 PetscFunctionReturn(0); 301 } 302 303 #undef __FUNCT__ 304 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 305 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 306 { 307 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 308 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 309 int *diag = a->diag,oidx; 310 MatScalar *aa=a->a,*v; 311 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 312 PetscScalar *x,*b; 313 314 PetscFunctionBegin; 315 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 316 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 317 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 318 319 /* forward solve the U^T */ 320 idx = 0; 321 for (i=0; i<n; i++) { 322 323 v = aa + 36*diag[i]; 324 /* multiply by the inverse of the block diagonal */ 325 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 326 x6 = x[5+idx]; 327 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 328 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 329 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 330 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 331 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 332 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 333 v += 36; 334 335 vi = aj + diag[i] + 1; 336 nz = ai[i+1] - diag[i] - 1; 337 while (nz--) { 338 oidx = 6*(*vi++); 339 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 340 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 341 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 342 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 343 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 344 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 345 v += 36; 346 } 347 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 348 x[5+idx] = s6; 349 idx += 6; 350 } 351 /* backward solve the L^T */ 352 for (i=n-1; i>=0; i--){ 353 v = aa + 36*diag[i] - 36; 354 vi = aj + diag[i] - 1; 355 nz = diag[i] - ai[i]; 356 idt = 6*i; 357 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 358 s6 = x[5+idt]; 359 while (nz--) { 360 idx = 6*(*vi--); 361 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 362 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 363 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 364 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 365 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 366 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 367 v -= 36; 368 } 369 } 370 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 371 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 372 PetscLogFlops(2*36*(a->nz) - 6*A->n); 373 PetscFunctionReturn(0); 374 } 375 376 #undef __FUNCT__ 377 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 378 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 379 { 380 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 381 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 382 int *diag = a->diag,oidx; 383 MatScalar *aa=a->a,*v; 384 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 385 PetscScalar *x,*b; 386 387 PetscFunctionBegin; 388 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 389 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 390 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 391 392 /* forward solve the U^T */ 393 idx = 0; 394 for (i=0; i<n; i++) { 395 396 v = aa + 49*diag[i]; 397 /* multiply by the inverse of the block diagonal */ 398 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 399 x6 = x[5+idx]; x7 = x[6+idx]; 400 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 401 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 402 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 403 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 404 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 405 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 406 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 407 v += 49; 408 409 vi = aj + diag[i] + 1; 410 nz = ai[i+1] - diag[i] - 1; 411 while (nz--) { 412 oidx = 7*(*vi++); 413 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 414 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 415 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 416 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 417 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 418 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 419 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 420 v += 49; 421 } 422 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 423 x[5+idx] = s6;x[6+idx] = s7; 424 idx += 7; 425 } 426 /* backward solve the L^T */ 427 for (i=n-1; i>=0; i--){ 428 v = aa + 49*diag[i] - 49; 429 vi = aj + diag[i] - 1; 430 nz = diag[i] - ai[i]; 431 idt = 7*i; 432 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 433 s6 = x[5+idt];s7 = x[6+idt]; 434 while (nz--) { 435 idx = 7*(*vi--); 436 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 437 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 438 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 439 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 440 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 441 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 442 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 443 v -= 49; 444 } 445 } 446 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 447 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 448 PetscLogFlops(2*49*(a->nz) - 7*A->n); 449 PetscFunctionReturn(0); 450 } 451 452 /*---------------------------------------------------------------------------------------------*/ 453 #undef __FUNCT__ 454 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 455 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 456 { 457 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 458 IS iscol=a->col,isrow=a->row; 459 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 460 int *diag = a->diag; 461 MatScalar *aa=a->a,*v; 462 PetscScalar s1,*x,*b,*t; 463 464 PetscFunctionBegin; 465 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 466 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 467 t = a->solve_work; 468 469 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 470 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 471 472 /* copy the b into temp work space according to permutation */ 473 for (i=0; i<n; i++) { 474 t[i] = b[c[i]]; 475 } 476 477 /* forward solve the U^T */ 478 for (i=0; i<n; i++) { 479 480 v = aa + diag[i]; 481 /* multiply by the inverse of the block diagonal */ 482 s1 = (*v++)*t[i]; 483 vi = aj + diag[i] + 1; 484 nz = ai[i+1] - diag[i] - 1; 485 while (nz--) { 486 t[*vi++] -= (*v++)*s1; 487 } 488 t[i] = s1; 489 } 490 /* backward solve the L^T */ 491 for (i=n-1; i>=0; i--){ 492 v = aa + diag[i] - 1; 493 vi = aj + diag[i] - 1; 494 nz = diag[i] - ai[i]; 495 s1 = t[i]; 496 while (nz--) { 497 t[*vi--] -= (*v--)*s1; 498 } 499 } 500 501 /* copy t into x according to permutation */ 502 for (i=0; i<n; i++) { 503 x[r[i]] = t[i]; 504 } 505 506 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 507 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 508 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 509 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 510 PetscLogFlops(2*(a->nz) - A->n); 511 PetscFunctionReturn(0); 512 } 513 514 #undef __FUNCT__ 515 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 516 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 517 { 518 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 519 IS iscol=a->col,isrow=a->row; 520 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 521 int *diag = a->diag,ii,ic,ir,oidx; 522 MatScalar *aa=a->a,*v; 523 PetscScalar s1,s2,x1,x2; 524 PetscScalar *x,*b,*t; 525 526 PetscFunctionBegin; 527 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 528 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 529 t = a->solve_work; 530 531 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 532 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 533 534 /* copy the b into temp work space according to permutation */ 535 ii = 0; 536 for (i=0; i<n; i++) { 537 ic = 2*c[i]; 538 t[ii] = b[ic]; 539 t[ii+1] = b[ic+1]; 540 ii += 2; 541 } 542 543 /* forward solve the U^T */ 544 idx = 0; 545 for (i=0; i<n; i++) { 546 547 v = aa + 4*diag[i]; 548 /* multiply by the inverse of the block diagonal */ 549 x1 = t[idx]; x2 = t[1+idx]; 550 s1 = v[0]*x1 + v[1]*x2; 551 s2 = v[2]*x1 + v[3]*x2; 552 v += 4; 553 554 vi = aj + diag[i] + 1; 555 nz = ai[i+1] - diag[i] - 1; 556 while (nz--) { 557 oidx = 2*(*vi++); 558 t[oidx] -= v[0]*s1 + v[1]*s2; 559 t[oidx+1] -= v[2]*s1 + v[3]*s2; 560 v += 4; 561 } 562 t[idx] = s1;t[1+idx] = s2; 563 idx += 2; 564 } 565 /* backward solve the L^T */ 566 for (i=n-1; i>=0; i--){ 567 v = aa + 4*diag[i] - 4; 568 vi = aj + diag[i] - 1; 569 nz = diag[i] - ai[i]; 570 idt = 2*i; 571 s1 = t[idt]; s2 = t[1+idt]; 572 while (nz--) { 573 idx = 2*(*vi--); 574 t[idx] -= v[0]*s1 + v[1]*s2; 575 t[idx+1] -= v[2]*s1 + v[3]*s2; 576 v -= 4; 577 } 578 } 579 580 /* copy t into x according to permutation */ 581 ii = 0; 582 for (i=0; i<n; i++) { 583 ir = 2*r[i]; 584 x[ir] = t[ii]; 585 x[ir+1] = t[ii+1]; 586 ii += 2; 587 } 588 589 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 590 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 591 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 592 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 593 PetscLogFlops(2*4*(a->nz) - 2*A->n); 594 PetscFunctionReturn(0); 595 } 596 597 #undef __FUNCT__ 598 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 599 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 600 { 601 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 602 IS iscol=a->col,isrow=a->row; 603 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 604 int *diag = a->diag,ii,ic,ir,oidx; 605 MatScalar *aa=a->a,*v; 606 PetscScalar s1,s2,s3,x1,x2,x3; 607 PetscScalar *x,*b,*t; 608 609 PetscFunctionBegin; 610 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 611 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 612 t = a->solve_work; 613 614 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 615 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 616 617 /* copy the b into temp work space according to permutation */ 618 ii = 0; 619 for (i=0; i<n; i++) { 620 ic = 3*c[i]; 621 t[ii] = b[ic]; 622 t[ii+1] = b[ic+1]; 623 t[ii+2] = b[ic+2]; 624 ii += 3; 625 } 626 627 /* forward solve the U^T */ 628 idx = 0; 629 for (i=0; i<n; i++) { 630 631 v = aa + 9*diag[i]; 632 /* multiply by the inverse of the block diagonal */ 633 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 634 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 635 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 636 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 637 v += 9; 638 639 vi = aj + diag[i] + 1; 640 nz = ai[i+1] - diag[i] - 1; 641 while (nz--) { 642 oidx = 3*(*vi++); 643 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 644 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 645 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 646 v += 9; 647 } 648 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 649 idx += 3; 650 } 651 /* backward solve the L^T */ 652 for (i=n-1; i>=0; i--){ 653 v = aa + 9*diag[i] - 9; 654 vi = aj + diag[i] - 1; 655 nz = diag[i] - ai[i]; 656 idt = 3*i; 657 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 658 while (nz--) { 659 idx = 3*(*vi--); 660 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 661 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 662 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 663 v -= 9; 664 } 665 } 666 667 /* copy t into x according to permutation */ 668 ii = 0; 669 for (i=0; i<n; i++) { 670 ir = 3*r[i]; 671 x[ir] = t[ii]; 672 x[ir+1] = t[ii+1]; 673 x[ir+2] = t[ii+2]; 674 ii += 3; 675 } 676 677 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 678 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 679 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 680 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 681 PetscLogFlops(2*9*(a->nz) - 3*A->n); 682 PetscFunctionReturn(0); 683 } 684 685 #undef __FUNCT__ 686 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 687 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 688 { 689 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 690 IS iscol=a->col,isrow=a->row; 691 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 692 int *diag = a->diag,ii,ic,ir,oidx; 693 MatScalar *aa=a->a,*v; 694 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 695 PetscScalar *x,*b,*t; 696 697 PetscFunctionBegin; 698 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 699 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 700 t = a->solve_work; 701 702 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 703 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 704 705 /* copy the b into temp work space according to permutation */ 706 ii = 0; 707 for (i=0; i<n; i++) { 708 ic = 4*c[i]; 709 t[ii] = b[ic]; 710 t[ii+1] = b[ic+1]; 711 t[ii+2] = b[ic+2]; 712 t[ii+3] = b[ic+3]; 713 ii += 4; 714 } 715 716 /* forward solve the U^T */ 717 idx = 0; 718 for (i=0; i<n; i++) { 719 720 v = aa + 16*diag[i]; 721 /* multiply by the inverse of the block diagonal */ 722 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 723 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 724 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 725 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 726 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 727 v += 16; 728 729 vi = aj + diag[i] + 1; 730 nz = ai[i+1] - diag[i] - 1; 731 while (nz--) { 732 oidx = 4*(*vi++); 733 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 734 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 735 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 736 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 737 v += 16; 738 } 739 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 740 idx += 4; 741 } 742 /* backward solve the L^T */ 743 for (i=n-1; i>=0; i--){ 744 v = aa + 16*diag[i] - 16; 745 vi = aj + diag[i] - 1; 746 nz = diag[i] - ai[i]; 747 idt = 4*i; 748 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 749 while (nz--) { 750 idx = 4*(*vi--); 751 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 752 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 753 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 754 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 755 v -= 16; 756 } 757 } 758 759 /* copy t into x according to permutation */ 760 ii = 0; 761 for (i=0; i<n; i++) { 762 ir = 4*r[i]; 763 x[ir] = t[ii]; 764 x[ir+1] = t[ii+1]; 765 x[ir+2] = t[ii+2]; 766 x[ir+3] = t[ii+3]; 767 ii += 4; 768 } 769 770 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 771 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 772 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 773 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 774 PetscLogFlops(2*16*(a->nz) - 4*A->n); 775 PetscFunctionReturn(0); 776 } 777 778 #undef __FUNCT__ 779 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 780 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 781 { 782 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 783 IS iscol=a->col,isrow=a->row; 784 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 785 int *diag = a->diag,ii,ic,ir,oidx; 786 MatScalar *aa=a->a,*v; 787 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 788 PetscScalar *x,*b,*t; 789 790 PetscFunctionBegin; 791 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 792 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 793 t = a->solve_work; 794 795 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 796 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 797 798 /* copy the b into temp work space according to permutation */ 799 ii = 0; 800 for (i=0; i<n; i++) { 801 ic = 5*c[i]; 802 t[ii] = b[ic]; 803 t[ii+1] = b[ic+1]; 804 t[ii+2] = b[ic+2]; 805 t[ii+3] = b[ic+3]; 806 t[ii+4] = b[ic+4]; 807 ii += 5; 808 } 809 810 /* forward solve the U^T */ 811 idx = 0; 812 for (i=0; i<n; i++) { 813 814 v = aa + 25*diag[i]; 815 /* multiply by the inverse of the block diagonal */ 816 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 817 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 818 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 819 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 820 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 821 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 822 v += 25; 823 824 vi = aj + diag[i] + 1; 825 nz = ai[i+1] - diag[i] - 1; 826 while (nz--) { 827 oidx = 5*(*vi++); 828 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 829 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 830 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 831 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 832 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 833 v += 25; 834 } 835 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 836 idx += 5; 837 } 838 /* backward solve the L^T */ 839 for (i=n-1; i>=0; i--){ 840 v = aa + 25*diag[i] - 25; 841 vi = aj + diag[i] - 1; 842 nz = diag[i] - ai[i]; 843 idt = 5*i; 844 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 845 while (nz--) { 846 idx = 5*(*vi--); 847 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 848 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 849 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 850 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 851 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 852 v -= 25; 853 } 854 } 855 856 /* copy t into x according to permutation */ 857 ii = 0; 858 for (i=0; i<n; i++) { 859 ir = 5*r[i]; 860 x[ir] = t[ii]; 861 x[ir+1] = t[ii+1]; 862 x[ir+2] = t[ii+2]; 863 x[ir+3] = t[ii+3]; 864 x[ir+4] = t[ii+4]; 865 ii += 5; 866 } 867 868 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 869 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 870 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 871 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 872 PetscLogFlops(2*25*(a->nz) - 5*A->n); 873 PetscFunctionReturn(0); 874 } 875 876 #undef __FUNCT__ 877 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 878 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 879 { 880 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 881 IS iscol=a->col,isrow=a->row; 882 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 883 int *diag = a->diag,ii,ic,ir,oidx; 884 MatScalar *aa=a->a,*v; 885 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 886 PetscScalar *x,*b,*t; 887 888 PetscFunctionBegin; 889 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 890 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 891 t = a->solve_work; 892 893 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 894 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 895 896 /* copy the b into temp work space according to permutation */ 897 ii = 0; 898 for (i=0; i<n; i++) { 899 ic = 6*c[i]; 900 t[ii] = b[ic]; 901 t[ii+1] = b[ic+1]; 902 t[ii+2] = b[ic+2]; 903 t[ii+3] = b[ic+3]; 904 t[ii+4] = b[ic+4]; 905 t[ii+5] = b[ic+5]; 906 ii += 6; 907 } 908 909 /* forward solve the U^T */ 910 idx = 0; 911 for (i=0; i<n; i++) { 912 913 v = aa + 36*diag[i]; 914 /* multiply by the inverse of the block diagonal */ 915 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 916 x6 = t[5+idx]; 917 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 918 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 919 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 920 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 921 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 922 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 923 v += 36; 924 925 vi = aj + diag[i] + 1; 926 nz = ai[i+1] - diag[i] - 1; 927 while (nz--) { 928 oidx = 6*(*vi++); 929 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 930 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 931 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 932 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 933 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 934 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 935 v += 36; 936 } 937 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 938 t[5+idx] = s6; 939 idx += 6; 940 } 941 /* backward solve the L^T */ 942 for (i=n-1; i>=0; i--){ 943 v = aa + 36*diag[i] - 36; 944 vi = aj + diag[i] - 1; 945 nz = diag[i] - ai[i]; 946 idt = 6*i; 947 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 948 s6 = t[5+idt]; 949 while (nz--) { 950 idx = 6*(*vi--); 951 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 952 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 953 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 954 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 955 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 956 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 957 v -= 36; 958 } 959 } 960 961 /* copy t into x according to permutation */ 962 ii = 0; 963 for (i=0; i<n; i++) { 964 ir = 6*r[i]; 965 x[ir] = t[ii]; 966 x[ir+1] = t[ii+1]; 967 x[ir+2] = t[ii+2]; 968 x[ir+3] = t[ii+3]; 969 x[ir+4] = t[ii+4]; 970 x[ir+5] = t[ii+5]; 971 ii += 6; 972 } 973 974 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 975 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 976 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 977 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 978 PetscLogFlops(2*36*(a->nz) - 6*A->n); 979 PetscFunctionReturn(0); 980 } 981 982 #undef __FUNCT__ 983 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 984 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 985 { 986 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 987 IS iscol=a->col,isrow=a->row; 988 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 989 int *diag = a->diag,ii,ic,ir,oidx; 990 MatScalar *aa=a->a,*v; 991 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 992 PetscScalar *x,*b,*t; 993 994 PetscFunctionBegin; 995 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 996 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 997 t = a->solve_work; 998 999 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1000 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1001 1002 /* copy the b into temp work space according to permutation */ 1003 ii = 0; 1004 for (i=0; i<n; i++) { 1005 ic = 7*c[i]; 1006 t[ii] = b[ic]; 1007 t[ii+1] = b[ic+1]; 1008 t[ii+2] = b[ic+2]; 1009 t[ii+3] = b[ic+3]; 1010 t[ii+4] = b[ic+4]; 1011 t[ii+5] = b[ic+5]; 1012 t[ii+6] = b[ic+6]; 1013 ii += 7; 1014 } 1015 1016 /* forward solve the U^T */ 1017 idx = 0; 1018 for (i=0; i<n; i++) { 1019 1020 v = aa + 49*diag[i]; 1021 /* multiply by the inverse of the block diagonal */ 1022 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1023 x6 = t[5+idx]; x7 = t[6+idx]; 1024 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1025 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1026 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1027 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1028 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1029 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1030 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1031 v += 49; 1032 1033 vi = aj + diag[i] + 1; 1034 nz = ai[i+1] - diag[i] - 1; 1035 while (nz--) { 1036 oidx = 7*(*vi++); 1037 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1038 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1039 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1040 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1041 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1042 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1043 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1044 v += 49; 1045 } 1046 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1047 t[5+idx] = s6;t[6+idx] = s7; 1048 idx += 7; 1049 } 1050 /* backward solve the L^T */ 1051 for (i=n-1; i>=0; i--){ 1052 v = aa + 49*diag[i] - 49; 1053 vi = aj + diag[i] - 1; 1054 nz = diag[i] - ai[i]; 1055 idt = 7*i; 1056 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1057 s6 = t[5+idt];s7 = t[6+idt]; 1058 while (nz--) { 1059 idx = 7*(*vi--); 1060 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1061 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1062 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1063 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1064 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1065 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1066 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1067 v -= 49; 1068 } 1069 } 1070 1071 /* copy t into x according to permutation */ 1072 ii = 0; 1073 for (i=0; i<n; i++) { 1074 ir = 7*r[i]; 1075 x[ir] = t[ii]; 1076 x[ir+1] = t[ii+1]; 1077 x[ir+2] = t[ii+2]; 1078 x[ir+3] = t[ii+3]; 1079 x[ir+4] = t[ii+4]; 1080 x[ir+5] = t[ii+5]; 1081 x[ir+6] = t[ii+6]; 1082 ii += 7; 1083 } 1084 1085 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1086 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1087 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1088 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1089 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1090 PetscFunctionReturn(0); 1091 } 1092 1093 /* ----------------------------------------------------------- */ 1094 #undef __FUNCT__ 1095 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1096 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1097 { 1098 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1099 IS iscol=a->col,isrow=a->row; 1100 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1101 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1102 MatScalar *aa=a->a,*v; 1103 PetscScalar *x,*b,*s,*t,*ls; 1104 1105 PetscFunctionBegin; 1106 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1107 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1108 t = a->solve_work; 1109 1110 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1111 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1112 1113 /* forward solve the lower triangular */ 1114 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1115 for (i=1; i<n; i++) { 1116 v = aa + bs2*ai[i]; 1117 vi = aj + ai[i]; 1118 nz = a->diag[i] - ai[i]; 1119 s = t + bs*i; 1120 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1121 while (nz--) { 1122 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1123 v += bs2; 1124 } 1125 } 1126 /* backward solve the upper triangular */ 1127 ls = a->solve_work + A->n; 1128 for (i=n-1; i>=0; i--){ 1129 v = aa + bs2*(a->diag[i] + 1); 1130 vi = aj + a->diag[i] + 1; 1131 nz = ai[i+1] - a->diag[i] - 1; 1132 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1133 while (nz--) { 1134 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1135 v += bs2; 1136 } 1137 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1138 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1139 } 1140 1141 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1142 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1143 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1144 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1145 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1146 PetscFunctionReturn(0); 1147 } 1148 1149 #undef __FUNCT__ 1150 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1151 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1152 { 1153 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1154 IS iscol=a->col,isrow=a->row; 1155 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1156 int *diag = a->diag; 1157 MatScalar *aa=a->a,*v; 1158 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1159 PetscScalar *x,*b,*t; 1160 1161 PetscFunctionBegin; 1162 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1163 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1164 t = a->solve_work; 1165 1166 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1167 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1168 1169 /* forward solve the lower triangular */ 1170 idx = 7*(*r++); 1171 t[0] = b[idx]; t[1] = b[1+idx]; 1172 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1173 t[5] = b[5+idx]; t[6] = b[6+idx]; 1174 1175 for (i=1; i<n; i++) { 1176 v = aa + 49*ai[i]; 1177 vi = aj + ai[i]; 1178 nz = diag[i] - ai[i]; 1179 idx = 7*(*r++); 1180 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1181 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1182 while (nz--) { 1183 idx = 7*(*vi++); 1184 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1185 x4 = t[3+idx];x5 = t[4+idx]; 1186 x6 = t[5+idx];x7 = t[6+idx]; 1187 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1188 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1189 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1190 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1191 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1192 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1193 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1194 v += 49; 1195 } 1196 idx = 7*i; 1197 t[idx] = s1;t[1+idx] = s2; 1198 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1199 t[5+idx] = s6;t[6+idx] = s7; 1200 } 1201 /* backward solve the upper triangular */ 1202 for (i=n-1; i>=0; i--){ 1203 v = aa + 49*diag[i] + 49; 1204 vi = aj + diag[i] + 1; 1205 nz = ai[i+1] - diag[i] - 1; 1206 idt = 7*i; 1207 s1 = t[idt]; s2 = t[1+idt]; 1208 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1209 s6 = t[5+idt];s7 = t[6+idt]; 1210 while (nz--) { 1211 idx = 7*(*vi++); 1212 x1 = t[idx]; x2 = t[1+idx]; 1213 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1214 x6 = t[5+idx]; x7 = t[6+idx]; 1215 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1216 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1217 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1218 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1219 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1220 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1221 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1222 v += 49; 1223 } 1224 idc = 7*(*c--); 1225 v = aa + 49*diag[i]; 1226 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1227 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1228 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1229 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1230 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1231 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1232 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1233 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1234 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1235 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1236 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1237 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1238 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1239 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1240 } 1241 1242 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1243 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1244 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1245 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1246 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1247 PetscFunctionReturn(0); 1248 } 1249 1250 #undef __FUNCT__ 1251 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1252 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1253 { 1254 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1255 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1256 int ierr,*diag = a->diag,jdx; 1257 MatScalar *aa=a->a,*v; 1258 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1259 1260 PetscFunctionBegin; 1261 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1262 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1263 /* forward solve the lower triangular */ 1264 idx = 0; 1265 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1266 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1267 x[6] = b[6+idx]; 1268 for (i=1; i<n; i++) { 1269 v = aa + 49*ai[i]; 1270 vi = aj + ai[i]; 1271 nz = diag[i] - ai[i]; 1272 idx = 7*i; 1273 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1274 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1275 s7 = b[6+idx]; 1276 while (nz--) { 1277 jdx = 7*(*vi++); 1278 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1279 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1280 x7 = x[6+jdx]; 1281 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1282 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1283 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1284 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1285 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1286 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1287 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1288 v += 49; 1289 } 1290 x[idx] = s1; 1291 x[1+idx] = s2; 1292 x[2+idx] = s3; 1293 x[3+idx] = s4; 1294 x[4+idx] = s5; 1295 x[5+idx] = s6; 1296 x[6+idx] = s7; 1297 } 1298 /* backward solve the upper triangular */ 1299 for (i=n-1; i>=0; i--){ 1300 v = aa + 49*diag[i] + 49; 1301 vi = aj + diag[i] + 1; 1302 nz = ai[i+1] - diag[i] - 1; 1303 idt = 7*i; 1304 s1 = x[idt]; s2 = x[1+idt]; 1305 s3 = x[2+idt]; s4 = x[3+idt]; 1306 s5 = x[4+idt]; s6 = x[5+idt]; 1307 s7 = x[6+idt]; 1308 while (nz--) { 1309 idx = 7*(*vi++); 1310 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1311 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1312 x7 = x[6+idx]; 1313 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1314 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1315 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1316 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1317 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1318 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1319 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1320 v += 49; 1321 } 1322 v = aa + 49*diag[i]; 1323 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1324 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1325 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1326 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1327 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1328 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1329 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1330 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1331 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1332 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1333 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1334 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1335 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1336 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1337 } 1338 1339 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1340 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1341 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1342 PetscFunctionReturn(0); 1343 } 1344 1345 #undef __FUNCT__ 1346 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1347 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1348 { 1349 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1350 IS iscol=a->col,isrow=a->row; 1351 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1352 int *diag = a->diag; 1353 MatScalar *aa=a->a,*v; 1354 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1355 1356 PetscFunctionBegin; 1357 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1358 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1359 t = a->solve_work; 1360 1361 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1362 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1363 1364 /* forward solve the lower triangular */ 1365 idx = 6*(*r++); 1366 t[0] = b[idx]; t[1] = b[1+idx]; 1367 t[2] = b[2+idx]; t[3] = b[3+idx]; 1368 t[4] = b[4+idx]; t[5] = b[5+idx]; 1369 for (i=1; i<n; i++) { 1370 v = aa + 36*ai[i]; 1371 vi = aj + ai[i]; 1372 nz = diag[i] - ai[i]; 1373 idx = 6*(*r++); 1374 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1375 s5 = b[4+idx]; s6 = b[5+idx]; 1376 while (nz--) { 1377 idx = 6*(*vi++); 1378 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1379 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1380 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1381 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1382 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1383 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1384 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1385 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1386 v += 36; 1387 } 1388 idx = 6*i; 1389 t[idx] = s1;t[1+idx] = s2; 1390 t[2+idx] = s3;t[3+idx] = s4; 1391 t[4+idx] = s5;t[5+idx] = s6; 1392 } 1393 /* backward solve the upper triangular */ 1394 for (i=n-1; i>=0; i--){ 1395 v = aa + 36*diag[i] + 36; 1396 vi = aj + diag[i] + 1; 1397 nz = ai[i+1] - diag[i] - 1; 1398 idt = 6*i; 1399 s1 = t[idt]; s2 = t[1+idt]; 1400 s3 = t[2+idt];s4 = t[3+idt]; 1401 s5 = t[4+idt];s6 = t[5+idt]; 1402 while (nz--) { 1403 idx = 6*(*vi++); 1404 x1 = t[idx]; x2 = t[1+idx]; 1405 x3 = t[2+idx]; x4 = t[3+idx]; 1406 x5 = t[4+idx]; x6 = t[5+idx]; 1407 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1408 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1409 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1410 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1411 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1412 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1413 v += 36; 1414 } 1415 idc = 6*(*c--); 1416 v = aa + 36*diag[i]; 1417 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1418 v[18]*s4+v[24]*s5+v[30]*s6; 1419 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1420 v[19]*s4+v[25]*s5+v[31]*s6; 1421 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1422 v[20]*s4+v[26]*s5+v[32]*s6; 1423 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1424 v[21]*s4+v[27]*s5+v[33]*s6; 1425 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1426 v[22]*s4+v[28]*s5+v[34]*s6; 1427 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1428 v[23]*s4+v[29]*s5+v[35]*s6; 1429 } 1430 1431 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1432 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1433 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1434 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1435 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1436 PetscFunctionReturn(0); 1437 } 1438 1439 #undef __FUNCT__ 1440 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1441 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1442 { 1443 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1444 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1445 int ierr,*diag = a->diag,jdx; 1446 MatScalar *aa=a->a,*v; 1447 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1448 1449 PetscFunctionBegin; 1450 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1451 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1452 /* forward solve the lower triangular */ 1453 idx = 0; 1454 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1455 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1456 for (i=1; i<n; i++) { 1457 v = aa + 36*ai[i]; 1458 vi = aj + ai[i]; 1459 nz = diag[i] - ai[i]; 1460 idx = 6*i; 1461 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1462 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1463 while (nz--) { 1464 jdx = 6*(*vi++); 1465 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1466 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1467 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1468 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1469 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1470 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1471 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1472 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1473 v += 36; 1474 } 1475 x[idx] = s1; 1476 x[1+idx] = s2; 1477 x[2+idx] = s3; 1478 x[3+idx] = s4; 1479 x[4+idx] = s5; 1480 x[5+idx] = s6; 1481 } 1482 /* backward solve the upper triangular */ 1483 for (i=n-1; i>=0; i--){ 1484 v = aa + 36*diag[i] + 36; 1485 vi = aj + diag[i] + 1; 1486 nz = ai[i+1] - diag[i] - 1; 1487 idt = 6*i; 1488 s1 = x[idt]; s2 = x[1+idt]; 1489 s3 = x[2+idt]; s4 = x[3+idt]; 1490 s5 = x[4+idt]; s6 = x[5+idt]; 1491 while (nz--) { 1492 idx = 6*(*vi++); 1493 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1494 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1495 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1496 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1497 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1498 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1499 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1500 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1501 v += 36; 1502 } 1503 v = aa + 36*diag[i]; 1504 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1505 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1506 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1507 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1508 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1509 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1510 } 1511 1512 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1513 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1514 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1515 PetscFunctionReturn(0); 1516 } 1517 1518 #undef __FUNCT__ 1519 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1520 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1521 { 1522 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1523 IS iscol=a->col,isrow=a->row; 1524 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1525 int *diag = a->diag; 1526 MatScalar *aa=a->a,*v; 1527 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1528 1529 PetscFunctionBegin; 1530 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1531 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1532 t = a->solve_work; 1533 1534 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1535 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1536 1537 /* forward solve the lower triangular */ 1538 idx = 5*(*r++); 1539 t[0] = b[idx]; t[1] = b[1+idx]; 1540 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1541 for (i=1; i<n; i++) { 1542 v = aa + 25*ai[i]; 1543 vi = aj + ai[i]; 1544 nz = diag[i] - ai[i]; 1545 idx = 5*(*r++); 1546 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1547 s5 = b[4+idx]; 1548 while (nz--) { 1549 idx = 5*(*vi++); 1550 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1551 x4 = t[3+idx];x5 = t[4+idx]; 1552 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1553 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1554 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1555 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1556 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1557 v += 25; 1558 } 1559 idx = 5*i; 1560 t[idx] = s1;t[1+idx] = s2; 1561 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1562 } 1563 /* backward solve the upper triangular */ 1564 for (i=n-1; i>=0; i--){ 1565 v = aa + 25*diag[i] + 25; 1566 vi = aj + diag[i] + 1; 1567 nz = ai[i+1] - diag[i] - 1; 1568 idt = 5*i; 1569 s1 = t[idt]; s2 = t[1+idt]; 1570 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1571 while (nz--) { 1572 idx = 5*(*vi++); 1573 x1 = t[idx]; x2 = t[1+idx]; 1574 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1575 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1576 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1577 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1578 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1579 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1580 v += 25; 1581 } 1582 idc = 5*(*c--); 1583 v = aa + 25*diag[i]; 1584 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1585 v[15]*s4+v[20]*s5; 1586 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1587 v[16]*s4+v[21]*s5; 1588 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1589 v[17]*s4+v[22]*s5; 1590 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1591 v[18]*s4+v[23]*s5; 1592 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1593 v[19]*s4+v[24]*s5; 1594 } 1595 1596 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1597 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1598 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1599 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1600 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1601 PetscFunctionReturn(0); 1602 } 1603 1604 #undef __FUNCT__ 1605 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1606 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1607 { 1608 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1609 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1610 int ierr,*diag = a->diag,jdx; 1611 MatScalar *aa=a->a,*v; 1612 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1613 1614 PetscFunctionBegin; 1615 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1616 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1617 /* forward solve the lower triangular */ 1618 idx = 0; 1619 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1620 for (i=1; i<n; i++) { 1621 v = aa + 25*ai[i]; 1622 vi = aj + ai[i]; 1623 nz = diag[i] - ai[i]; 1624 idx = 5*i; 1625 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1626 while (nz--) { 1627 jdx = 5*(*vi++); 1628 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1629 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1630 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1631 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1632 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1633 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1634 v += 25; 1635 } 1636 x[idx] = s1; 1637 x[1+idx] = s2; 1638 x[2+idx] = s3; 1639 x[3+idx] = s4; 1640 x[4+idx] = s5; 1641 } 1642 /* backward solve the upper triangular */ 1643 for (i=n-1; i>=0; i--){ 1644 v = aa + 25*diag[i] + 25; 1645 vi = aj + diag[i] + 1; 1646 nz = ai[i+1] - diag[i] - 1; 1647 idt = 5*i; 1648 s1 = x[idt]; s2 = x[1+idt]; 1649 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1650 while (nz--) { 1651 idx = 5*(*vi++); 1652 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1653 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1654 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1655 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1656 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1657 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1658 v += 25; 1659 } 1660 v = aa + 25*diag[i]; 1661 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1662 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1663 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1664 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1665 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1666 } 1667 1668 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1669 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1670 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1671 PetscFunctionReturn(0); 1672 } 1673 1674 #undef __FUNCT__ 1675 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1676 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1677 { 1678 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1679 IS iscol=a->col,isrow=a->row; 1680 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1681 int *diag = a->diag; 1682 MatScalar *aa=a->a,*v; 1683 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1684 1685 PetscFunctionBegin; 1686 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1687 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1688 t = a->solve_work; 1689 1690 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1691 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1692 1693 /* forward solve the lower triangular */ 1694 idx = 4*(*r++); 1695 t[0] = b[idx]; t[1] = b[1+idx]; 1696 t[2] = b[2+idx]; t[3] = b[3+idx]; 1697 for (i=1; i<n; i++) { 1698 v = aa + 16*ai[i]; 1699 vi = aj + ai[i]; 1700 nz = diag[i] - ai[i]; 1701 idx = 4*(*r++); 1702 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1703 while (nz--) { 1704 idx = 4*(*vi++); 1705 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1706 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1707 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1708 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1709 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1710 v += 16; 1711 } 1712 idx = 4*i; 1713 t[idx] = s1;t[1+idx] = s2; 1714 t[2+idx] = s3;t[3+idx] = s4; 1715 } 1716 /* backward solve the upper triangular */ 1717 for (i=n-1; i>=0; i--){ 1718 v = aa + 16*diag[i] + 16; 1719 vi = aj + diag[i] + 1; 1720 nz = ai[i+1] - diag[i] - 1; 1721 idt = 4*i; 1722 s1 = t[idt]; s2 = t[1+idt]; 1723 s3 = t[2+idt];s4 = t[3+idt]; 1724 while (nz--) { 1725 idx = 4*(*vi++); 1726 x1 = t[idx]; x2 = t[1+idx]; 1727 x3 = t[2+idx]; x4 = t[3+idx]; 1728 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1729 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1730 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1731 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1732 v += 16; 1733 } 1734 idc = 4*(*c--); 1735 v = aa + 16*diag[i]; 1736 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1737 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1738 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1739 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1740 } 1741 1742 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1743 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1744 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1745 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1746 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1747 PetscFunctionReturn(0); 1748 } 1749 1750 #undef __FUNCT__ 1751 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1752 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1753 { 1754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1755 IS iscol=a->col,isrow=a->row; 1756 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1757 int *diag = a->diag; 1758 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1759 PetscScalar *x,*b; 1760 1761 PetscFunctionBegin; 1762 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1763 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1764 t = (MatScalar *)a->solve_work; 1765 1766 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1767 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1768 1769 /* forward solve the lower triangular */ 1770 idx = 4*(*r++); 1771 t[0] = (MatScalar)b[idx]; 1772 t[1] = (MatScalar)b[1+idx]; 1773 t[2] = (MatScalar)b[2+idx]; 1774 t[3] = (MatScalar)b[3+idx]; 1775 for (i=1; i<n; i++) { 1776 v = aa + 16*ai[i]; 1777 vi = aj + ai[i]; 1778 nz = diag[i] - ai[i]; 1779 idx = 4*(*r++); 1780 s1 = (MatScalar)b[idx]; 1781 s2 = (MatScalar)b[1+idx]; 1782 s3 = (MatScalar)b[2+idx]; 1783 s4 = (MatScalar)b[3+idx]; 1784 while (nz--) { 1785 idx = 4*(*vi++); 1786 x1 = t[idx]; 1787 x2 = t[1+idx]; 1788 x3 = t[2+idx]; 1789 x4 = t[3+idx]; 1790 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1791 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1792 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1793 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1794 v += 16; 1795 } 1796 idx = 4*i; 1797 t[idx] = s1; 1798 t[1+idx] = s2; 1799 t[2+idx] = s3; 1800 t[3+idx] = s4; 1801 } 1802 /* backward solve the upper triangular */ 1803 for (i=n-1; i>=0; i--){ 1804 v = aa + 16*diag[i] + 16; 1805 vi = aj + diag[i] + 1; 1806 nz = ai[i+1] - diag[i] - 1; 1807 idt = 4*i; 1808 s1 = t[idt]; 1809 s2 = t[1+idt]; 1810 s3 = t[2+idt]; 1811 s4 = t[3+idt]; 1812 while (nz--) { 1813 idx = 4*(*vi++); 1814 x1 = t[idx]; 1815 x2 = t[1+idx]; 1816 x3 = t[2+idx]; 1817 x4 = t[3+idx]; 1818 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1819 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1820 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1821 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1822 v += 16; 1823 } 1824 idc = 4*(*c--); 1825 v = aa + 16*diag[i]; 1826 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1827 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1828 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1829 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1830 x[idc] = (PetscScalar)t[idt]; 1831 x[1+idc] = (PetscScalar)t[1+idt]; 1832 x[2+idc] = (PetscScalar)t[2+idt]; 1833 x[3+idc] = (PetscScalar)t[3+idt]; 1834 } 1835 1836 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1837 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1838 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1839 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1840 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1841 PetscFunctionReturn(0); 1842 } 1843 1844 #if defined (PETSC_HAVE_SSE) 1845 1846 #include PETSC_HAVE_SSE 1847 1848 #undef __FUNCT__ 1849 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1850 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1851 { 1852 /* 1853 Note: This code uses demotion of double 1854 to float when performing the mixed-mode computation. 1855 This may not be numerically reasonable for all applications. 1856 */ 1857 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1858 IS iscol=a->col,isrow=a->row; 1859 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1860 int *diag = a->diag,ai16; 1861 MatScalar *aa=a->a,*v; 1862 PetscScalar *x,*b,*t; 1863 1864 /* Make space in temp stack for 16 Byte Aligned arrays */ 1865 float ssealignedspace[11],*tmps,*tmpx; 1866 unsigned long offset; 1867 1868 PetscFunctionBegin; 1869 SSE_SCOPE_BEGIN; 1870 1871 offset = (unsigned long)ssealignedspace % 16; 1872 if (offset) offset = (16 - offset)/4; 1873 tmps = &ssealignedspace[offset]; 1874 tmpx = &ssealignedspace[offset+4]; 1875 PREFETCH_NTA(aa+16*ai[1]); 1876 1877 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1878 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1879 t = a->solve_work; 1880 1881 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1882 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1883 1884 /* forward solve the lower triangular */ 1885 idx = 4*(*r++); 1886 t[0] = b[idx]; t[1] = b[1+idx]; 1887 t[2] = b[2+idx]; t[3] = b[3+idx]; 1888 v = aa + 16*ai[1]; 1889 1890 for (i=1; i<n;) { 1891 PREFETCH_NTA(&v[8]); 1892 vi = aj + ai[i]; 1893 nz = diag[i] - ai[i]; 1894 idx = 4*(*r++); 1895 1896 /* Demote sum from double to float */ 1897 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1898 LOAD_PS(tmps,XMM7); 1899 1900 while (nz--) { 1901 PREFETCH_NTA(&v[16]); 1902 idx = 4*(*vi++); 1903 1904 /* Demote solution (so far) from double to float */ 1905 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1906 1907 /* 4x4 Matrix-Vector product with negative accumulation: */ 1908 SSE_INLINE_BEGIN_2(tmpx,v) 1909 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1910 1911 /* First Column */ 1912 SSE_COPY_PS(XMM0,XMM6) 1913 SSE_SHUFFLE(XMM0,XMM0,0x00) 1914 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1915 SSE_SUB_PS(XMM7,XMM0) 1916 1917 /* Second Column */ 1918 SSE_COPY_PS(XMM1,XMM6) 1919 SSE_SHUFFLE(XMM1,XMM1,0x55) 1920 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1921 SSE_SUB_PS(XMM7,XMM1) 1922 1923 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1924 1925 /* Third Column */ 1926 SSE_COPY_PS(XMM2,XMM6) 1927 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1928 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1929 SSE_SUB_PS(XMM7,XMM2) 1930 1931 /* Fourth Column */ 1932 SSE_COPY_PS(XMM3,XMM6) 1933 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1934 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1935 SSE_SUB_PS(XMM7,XMM3) 1936 SSE_INLINE_END_2 1937 1938 v += 16; 1939 } 1940 idx = 4*i; 1941 v = aa + 16*ai[++i]; 1942 PREFETCH_NTA(v); 1943 STORE_PS(tmps,XMM7); 1944 1945 /* Promote result from float to double */ 1946 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1947 } 1948 /* backward solve the upper triangular */ 1949 idt = 4*(n-1); 1950 ai16 = 16*diag[n-1]; 1951 v = aa + ai16 + 16; 1952 for (i=n-1; i>=0;){ 1953 PREFETCH_NTA(&v[8]); 1954 vi = aj + diag[i] + 1; 1955 nz = ai[i+1] - diag[i] - 1; 1956 1957 /* Demote accumulator from double to float */ 1958 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1959 LOAD_PS(tmps,XMM7); 1960 1961 while (nz--) { 1962 PREFETCH_NTA(&v[16]); 1963 idx = 4*(*vi++); 1964 1965 /* Demote solution (so far) from double to float */ 1966 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1967 1968 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1969 SSE_INLINE_BEGIN_2(tmpx,v) 1970 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1971 1972 /* First Column */ 1973 SSE_COPY_PS(XMM0,XMM6) 1974 SSE_SHUFFLE(XMM0,XMM0,0x00) 1975 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1976 SSE_SUB_PS(XMM7,XMM0) 1977 1978 /* Second Column */ 1979 SSE_COPY_PS(XMM1,XMM6) 1980 SSE_SHUFFLE(XMM1,XMM1,0x55) 1981 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1982 SSE_SUB_PS(XMM7,XMM1) 1983 1984 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1985 1986 /* Third Column */ 1987 SSE_COPY_PS(XMM2,XMM6) 1988 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1989 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1990 SSE_SUB_PS(XMM7,XMM2) 1991 1992 /* Fourth Column */ 1993 SSE_COPY_PS(XMM3,XMM6) 1994 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1995 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1996 SSE_SUB_PS(XMM7,XMM3) 1997 SSE_INLINE_END_2 1998 v += 16; 1999 } 2000 v = aa + ai16; 2001 ai16 = 16*diag[--i]; 2002 PREFETCH_NTA(aa+ai16+16); 2003 /* 2004 Scale the result by the diagonal 4x4 block, 2005 which was inverted as part of the factorization 2006 */ 2007 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2008 /* First Column */ 2009 SSE_COPY_PS(XMM0,XMM7) 2010 SSE_SHUFFLE(XMM0,XMM0,0x00) 2011 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2012 2013 /* Second Column */ 2014 SSE_COPY_PS(XMM1,XMM7) 2015 SSE_SHUFFLE(XMM1,XMM1,0x55) 2016 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2017 SSE_ADD_PS(XMM0,XMM1) 2018 2019 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2020 2021 /* Third Column */ 2022 SSE_COPY_PS(XMM2,XMM7) 2023 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2024 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2025 SSE_ADD_PS(XMM0,XMM2) 2026 2027 /* Fourth Column */ 2028 SSE_COPY_PS(XMM3,XMM7) 2029 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2030 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2031 SSE_ADD_PS(XMM0,XMM3) 2032 2033 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2034 SSE_INLINE_END_3 2035 2036 /* Promote solution from float to double */ 2037 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2038 2039 /* Apply reordering to t and stream into x. */ 2040 /* This way, x doesn't pollute the cache. */ 2041 /* Be careful with size: 2 doubles = 4 floats! */ 2042 idc = 4*(*c--); 2043 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2044 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2045 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2046 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2047 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2048 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2049 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2050 SSE_INLINE_END_2 2051 v = aa + ai16 + 16; 2052 idt -= 4; 2053 } 2054 2055 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2056 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2057 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2058 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2059 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2060 SSE_SCOPE_END; 2061 PetscFunctionReturn(0); 2062 } 2063 2064 #endif 2065 2066 2067 /* 2068 Special case where the matrix was ILU(0) factored in the natural 2069 ordering. This eliminates the need for the column and row permutation. 2070 */ 2071 #undef __FUNCT__ 2072 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2073 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2074 { 2075 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2076 int n=a->mbs,*ai=a->i,*aj=a->j; 2077 int ierr,*diag = a->diag; 2078 MatScalar *aa=a->a; 2079 PetscScalar *x,*b; 2080 2081 PetscFunctionBegin; 2082 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2083 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2084 2085 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2086 { 2087 static PetscScalar w[2000]; /* very BAD need to fix */ 2088 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2089 } 2090 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2091 { 2092 static PetscScalar w[2000]; /* very BAD need to fix */ 2093 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2094 } 2095 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2096 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2097 #else 2098 { 2099 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2100 MatScalar *v; 2101 int jdx,idt,idx,nz,*vi,i,ai16; 2102 2103 /* forward solve the lower triangular */ 2104 idx = 0; 2105 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2106 for (i=1; i<n; i++) { 2107 v = aa + 16*ai[i]; 2108 vi = aj + ai[i]; 2109 nz = diag[i] - ai[i]; 2110 idx += 4; 2111 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2112 while (nz--) { 2113 jdx = 4*(*vi++); 2114 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2115 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2116 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2117 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2118 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2119 v += 16; 2120 } 2121 x[idx] = s1; 2122 x[1+idx] = s2; 2123 x[2+idx] = s3; 2124 x[3+idx] = s4; 2125 } 2126 /* backward solve the upper triangular */ 2127 idt = 4*(n-1); 2128 for (i=n-1; i>=0; i--){ 2129 ai16 = 16*diag[i]; 2130 v = aa + ai16 + 16; 2131 vi = aj + diag[i] + 1; 2132 nz = ai[i+1] - diag[i] - 1; 2133 s1 = x[idt]; s2 = x[1+idt]; 2134 s3 = x[2+idt];s4 = x[3+idt]; 2135 while (nz--) { 2136 idx = 4*(*vi++); 2137 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2138 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2139 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2140 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2141 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2142 v += 16; 2143 } 2144 v = aa + ai16; 2145 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2146 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2147 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2148 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2149 idt -= 4; 2150 } 2151 } 2152 #endif 2153 2154 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2155 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2156 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2157 PetscFunctionReturn(0); 2158 } 2159 2160 #undef __FUNCT__ 2161 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2162 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2163 { 2164 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2165 int n=a->mbs,*ai=a->i,*aj=a->j; 2166 int ierr,*diag = a->diag; 2167 MatScalar *aa=a->a; 2168 PetscScalar *x,*b; 2169 2170 PetscFunctionBegin; 2171 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2172 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2173 2174 { 2175 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2176 MatScalar *v,*t=(MatScalar *)x; 2177 int jdx,idt,idx,nz,*vi,i,ai16; 2178 2179 /* forward solve the lower triangular */ 2180 idx = 0; 2181 t[0] = (MatScalar)b[0]; 2182 t[1] = (MatScalar)b[1]; 2183 t[2] = (MatScalar)b[2]; 2184 t[3] = (MatScalar)b[3]; 2185 for (i=1; i<n; i++) { 2186 v = aa + 16*ai[i]; 2187 vi = aj + ai[i]; 2188 nz = diag[i] - ai[i]; 2189 idx += 4; 2190 s1 = (MatScalar)b[idx]; 2191 s2 = (MatScalar)b[1+idx]; 2192 s3 = (MatScalar)b[2+idx]; 2193 s4 = (MatScalar)b[3+idx]; 2194 while (nz--) { 2195 jdx = 4*(*vi++); 2196 x1 = t[jdx]; 2197 x2 = t[1+jdx]; 2198 x3 = t[2+jdx]; 2199 x4 = t[3+jdx]; 2200 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2201 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2202 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2203 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2204 v += 16; 2205 } 2206 t[idx] = s1; 2207 t[1+idx] = s2; 2208 t[2+idx] = s3; 2209 t[3+idx] = s4; 2210 } 2211 /* backward solve the upper triangular */ 2212 idt = 4*(n-1); 2213 for (i=n-1; i>=0; i--){ 2214 ai16 = 16*diag[i]; 2215 v = aa + ai16 + 16; 2216 vi = aj + diag[i] + 1; 2217 nz = ai[i+1] - diag[i] - 1; 2218 s1 = t[idt]; 2219 s2 = t[1+idt]; 2220 s3 = t[2+idt]; 2221 s4 = t[3+idt]; 2222 while (nz--) { 2223 idx = 4*(*vi++); 2224 x1 = (MatScalar)x[idx]; 2225 x2 = (MatScalar)x[1+idx]; 2226 x3 = (MatScalar)x[2+idx]; 2227 x4 = (MatScalar)x[3+idx]; 2228 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2229 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2230 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2231 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2232 v += 16; 2233 } 2234 v = aa + ai16; 2235 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2236 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2237 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2238 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2239 idt -= 4; 2240 } 2241 } 2242 2243 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2244 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2245 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2246 PetscFunctionReturn(0); 2247 } 2248 2249 #if defined (PETSC_HAVE_SSE) 2250 2251 #include PETSC_HAVE_SSE 2252 #undef __FUNCT__ 2253 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj" 2254 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx) 2255 { 2256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2257 unsigned short *aj=(unsigned short *)a->j; 2258 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2259 MatScalar *aa=a->a; 2260 PetscScalar *x,*b; 2261 2262 PetscFunctionBegin; 2263 SSE_SCOPE_BEGIN; 2264 /* 2265 Note: This code currently uses demotion of double 2266 to float when performing the mixed-mode computation. 2267 This may not be numerically reasonable for all applications. 2268 */ 2269 PREFETCH_NTA(aa+16*ai[1]); 2270 2271 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2272 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2273 { 2274 /* x will first be computed in single precision then promoted inplace to double */ 2275 MatScalar *v,*t=(MatScalar *)x; 2276 int nz,i,idt,ai16; 2277 unsigned int jdx,idx; 2278 unsigned short *vi; 2279 /* Forward solve the lower triangular factor. */ 2280 2281 /* First block is the identity. */ 2282 idx = 0; 2283 CONVERT_DOUBLE4_FLOAT4(t,b); 2284 v = aa + 16*((unsigned int)ai[1]); 2285 2286 for (i=1; i<n;) { 2287 PREFETCH_NTA(&v[8]); 2288 vi = aj + ai[i]; 2289 nz = diag[i] - ai[i]; 2290 idx += 4; 2291 2292 /* Demote RHS from double to float. */ 2293 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2294 LOAD_PS(&t[idx],XMM7); 2295 2296 while (nz--) { 2297 PREFETCH_NTA(&v[16]); 2298 jdx = 4*((unsigned int)(*vi++)); 2299 2300 /* 4x4 Matrix-Vector product with negative accumulation: */ 2301 SSE_INLINE_BEGIN_2(&t[jdx],v) 2302 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2303 2304 /* First Column */ 2305 SSE_COPY_PS(XMM0,XMM6) 2306 SSE_SHUFFLE(XMM0,XMM0,0x00) 2307 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2308 SSE_SUB_PS(XMM7,XMM0) 2309 2310 /* Second Column */ 2311 SSE_COPY_PS(XMM1,XMM6) 2312 SSE_SHUFFLE(XMM1,XMM1,0x55) 2313 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2314 SSE_SUB_PS(XMM7,XMM1) 2315 2316 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2317 2318 /* Third Column */ 2319 SSE_COPY_PS(XMM2,XMM6) 2320 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2321 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2322 SSE_SUB_PS(XMM7,XMM2) 2323 2324 /* Fourth Column */ 2325 SSE_COPY_PS(XMM3,XMM6) 2326 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2327 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2328 SSE_SUB_PS(XMM7,XMM3) 2329 SSE_INLINE_END_2 2330 2331 v += 16; 2332 } 2333 v = aa + 16*ai[++i]; 2334 PREFETCH_NTA(v); 2335 STORE_PS(&t[idx],XMM7); 2336 } 2337 2338 /* Backward solve the upper triangular factor.*/ 2339 2340 idt = 4*(n-1); 2341 ai16 = 16*diag[n-1]; 2342 v = aa + ai16 + 16; 2343 for (i=n-1; i>=0;){ 2344 PREFETCH_NTA(&v[8]); 2345 vi = aj + diag[i] + 1; 2346 nz = ai[i+1] - diag[i] - 1; 2347 2348 LOAD_PS(&t[idt],XMM7); 2349 2350 while (nz--) { 2351 PREFETCH_NTA(&v[16]); 2352 idx = 4*((unsigned int)(*vi++)); 2353 2354 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2355 SSE_INLINE_BEGIN_2(&t[idx],v) 2356 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2357 2358 /* First Column */ 2359 SSE_COPY_PS(XMM0,XMM6) 2360 SSE_SHUFFLE(XMM0,XMM0,0x00) 2361 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2362 SSE_SUB_PS(XMM7,XMM0) 2363 2364 /* Second Column */ 2365 SSE_COPY_PS(XMM1,XMM6) 2366 SSE_SHUFFLE(XMM1,XMM1,0x55) 2367 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2368 SSE_SUB_PS(XMM7,XMM1) 2369 2370 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2371 2372 /* Third Column */ 2373 SSE_COPY_PS(XMM2,XMM6) 2374 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2375 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2376 SSE_SUB_PS(XMM7,XMM2) 2377 2378 /* Fourth Column */ 2379 SSE_COPY_PS(XMM3,XMM6) 2380 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2381 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2382 SSE_SUB_PS(XMM7,XMM3) 2383 SSE_INLINE_END_2 2384 v += 16; 2385 } 2386 v = aa + ai16; 2387 ai16 = 16*diag[--i]; 2388 PREFETCH_NTA(aa+ai16+16); 2389 /* 2390 Scale the result by the diagonal 4x4 block, 2391 which was inverted as part of the factorization 2392 */ 2393 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2394 /* First Column */ 2395 SSE_COPY_PS(XMM0,XMM7) 2396 SSE_SHUFFLE(XMM0,XMM0,0x00) 2397 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2398 2399 /* Second Column */ 2400 SSE_COPY_PS(XMM1,XMM7) 2401 SSE_SHUFFLE(XMM1,XMM1,0x55) 2402 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2403 SSE_ADD_PS(XMM0,XMM1) 2404 2405 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2406 2407 /* Third Column */ 2408 SSE_COPY_PS(XMM2,XMM7) 2409 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2410 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2411 SSE_ADD_PS(XMM0,XMM2) 2412 2413 /* Fourth Column */ 2414 SSE_COPY_PS(XMM3,XMM7) 2415 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2416 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2417 SSE_ADD_PS(XMM0,XMM3) 2418 2419 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2420 SSE_INLINE_END_3 2421 2422 v = aa + ai16 + 16; 2423 idt -= 4; 2424 } 2425 2426 /* Convert t from single precision back to double precision (inplace)*/ 2427 idt = 4*(n-1); 2428 for (i=n-1;i>=0;i--) { 2429 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2430 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2431 PetscScalar *xtemp=&x[idt]; 2432 MatScalar *ttemp=&t[idt]; 2433 xtemp[3] = (PetscScalar)ttemp[3]; 2434 xtemp[2] = (PetscScalar)ttemp[2]; 2435 xtemp[1] = (PetscScalar)ttemp[1]; 2436 xtemp[0] = (PetscScalar)ttemp[0]; 2437 idt -= 4; 2438 } 2439 2440 } /* End of artificial scope. */ 2441 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2442 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2443 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2444 SSE_SCOPE_END; 2445 PetscFunctionReturn(0); 2446 } 2447 2448 #undef __FUNCT__ 2449 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2450 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2451 { 2452 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2453 int *aj=a->j; 2454 int ierr,*ai=a->i,n=a->mbs,*diag = a->diag; 2455 MatScalar *aa=a->a; 2456 PetscScalar *x,*b; 2457 2458 PetscFunctionBegin; 2459 SSE_SCOPE_BEGIN; 2460 /* 2461 Note: This code currently uses demotion of double 2462 to float when performing the mixed-mode computation. 2463 This may not be numerically reasonable for all applications. 2464 */ 2465 PREFETCH_NTA(aa+16*ai[1]); 2466 2467 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2469 { 2470 /* x will first be computed in single precision then promoted inplace to double */ 2471 MatScalar *v,*t=(MatScalar *)x; 2472 int nz,i,idt,ai16; 2473 int jdx,idx; 2474 int *vi; 2475 /* Forward solve the lower triangular factor. */ 2476 2477 /* First block is the identity. */ 2478 idx = 0; 2479 CONVERT_DOUBLE4_FLOAT4(t,b); 2480 v = aa + 16*ai[1]; 2481 2482 for (i=1; i<n;) { 2483 PREFETCH_NTA(&v[8]); 2484 vi = aj + ai[i]; 2485 nz = diag[i] - ai[i]; 2486 idx += 4; 2487 2488 /* Demote RHS from double to float. */ 2489 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2490 LOAD_PS(&t[idx],XMM7); 2491 2492 while (nz--) { 2493 PREFETCH_NTA(&v[16]); 2494 jdx = 4*(*vi++); 2495 /* jdx = *vi++; */ 2496 2497 /* 4x4 Matrix-Vector product with negative accumulation: */ 2498 SSE_INLINE_BEGIN_2(&t[jdx],v) 2499 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2500 2501 /* First Column */ 2502 SSE_COPY_PS(XMM0,XMM6) 2503 SSE_SHUFFLE(XMM0,XMM0,0x00) 2504 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2505 SSE_SUB_PS(XMM7,XMM0) 2506 2507 /* Second Column */ 2508 SSE_COPY_PS(XMM1,XMM6) 2509 SSE_SHUFFLE(XMM1,XMM1,0x55) 2510 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2511 SSE_SUB_PS(XMM7,XMM1) 2512 2513 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2514 2515 /* Third Column */ 2516 SSE_COPY_PS(XMM2,XMM6) 2517 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2518 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2519 SSE_SUB_PS(XMM7,XMM2) 2520 2521 /* Fourth Column */ 2522 SSE_COPY_PS(XMM3,XMM6) 2523 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2524 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2525 SSE_SUB_PS(XMM7,XMM3) 2526 SSE_INLINE_END_2 2527 2528 v += 16; 2529 } 2530 v = aa + 16*ai[++i]; 2531 PREFETCH_NTA(v); 2532 STORE_PS(&t[idx],XMM7); 2533 } 2534 2535 /* Backward solve the upper triangular factor.*/ 2536 2537 idt = 4*(n-1); 2538 ai16 = 16*diag[n-1]; 2539 v = aa + ai16 + 16; 2540 for (i=n-1; i>=0;){ 2541 PREFETCH_NTA(&v[8]); 2542 vi = aj + diag[i] + 1; 2543 nz = ai[i+1] - diag[i] - 1; 2544 2545 LOAD_PS(&t[idt],XMM7); 2546 2547 while (nz--) { 2548 PREFETCH_NTA(&v[16]); 2549 idx = 4*(*vi++); 2550 /* idx = *vi++; */ 2551 2552 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2553 SSE_INLINE_BEGIN_2(&t[idx],v) 2554 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2555 2556 /* First Column */ 2557 SSE_COPY_PS(XMM0,XMM6) 2558 SSE_SHUFFLE(XMM0,XMM0,0x00) 2559 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2560 SSE_SUB_PS(XMM7,XMM0) 2561 2562 /* Second Column */ 2563 SSE_COPY_PS(XMM1,XMM6) 2564 SSE_SHUFFLE(XMM1,XMM1,0x55) 2565 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2566 SSE_SUB_PS(XMM7,XMM1) 2567 2568 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2569 2570 /* Third Column */ 2571 SSE_COPY_PS(XMM2,XMM6) 2572 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2573 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2574 SSE_SUB_PS(XMM7,XMM2) 2575 2576 /* Fourth Column */ 2577 SSE_COPY_PS(XMM3,XMM6) 2578 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2579 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2580 SSE_SUB_PS(XMM7,XMM3) 2581 SSE_INLINE_END_2 2582 v += 16; 2583 } 2584 v = aa + ai16; 2585 ai16 = 16*diag[--i]; 2586 PREFETCH_NTA(aa+ai16+16); 2587 /* 2588 Scale the result by the diagonal 4x4 block, 2589 which was inverted as part of the factorization 2590 */ 2591 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2592 /* First Column */ 2593 SSE_COPY_PS(XMM0,XMM7) 2594 SSE_SHUFFLE(XMM0,XMM0,0x00) 2595 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2596 2597 /* Second Column */ 2598 SSE_COPY_PS(XMM1,XMM7) 2599 SSE_SHUFFLE(XMM1,XMM1,0x55) 2600 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2601 SSE_ADD_PS(XMM0,XMM1) 2602 2603 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2604 2605 /* Third Column */ 2606 SSE_COPY_PS(XMM2,XMM7) 2607 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2608 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2609 SSE_ADD_PS(XMM0,XMM2) 2610 2611 /* Fourth Column */ 2612 SSE_COPY_PS(XMM3,XMM7) 2613 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2614 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2615 SSE_ADD_PS(XMM0,XMM3) 2616 2617 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2618 SSE_INLINE_END_3 2619 2620 v = aa + ai16 + 16; 2621 idt -= 4; 2622 } 2623 2624 /* Convert t from single precision back to double precision (inplace)*/ 2625 idt = 4*(n-1); 2626 for (i=n-1;i>=0;i--) { 2627 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2628 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2629 PetscScalar *xtemp=&x[idt]; 2630 MatScalar *ttemp=&t[idt]; 2631 xtemp[3] = (PetscScalar)ttemp[3]; 2632 xtemp[2] = (PetscScalar)ttemp[2]; 2633 xtemp[1] = (PetscScalar)ttemp[1]; 2634 xtemp[0] = (PetscScalar)ttemp[0]; 2635 idt -= 4; 2636 } 2637 2638 } /* End of artificial scope. */ 2639 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2640 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2641 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2642 SSE_SCOPE_END; 2643 PetscFunctionReturn(0); 2644 } 2645 2646 #endif 2647 #undef __FUNCT__ 2648 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2649 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2650 { 2651 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2652 IS iscol=a->col,isrow=a->row; 2653 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2654 int *diag = a->diag; 2655 MatScalar *aa=a->a,*v; 2656 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2657 2658 PetscFunctionBegin; 2659 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2660 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2661 t = a->solve_work; 2662 2663 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2664 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2665 2666 /* forward solve the lower triangular */ 2667 idx = 3*(*r++); 2668 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2669 for (i=1; i<n; i++) { 2670 v = aa + 9*ai[i]; 2671 vi = aj + ai[i]; 2672 nz = diag[i] - ai[i]; 2673 idx = 3*(*r++); 2674 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2675 while (nz--) { 2676 idx = 3*(*vi++); 2677 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2678 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2679 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2680 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2681 v += 9; 2682 } 2683 idx = 3*i; 2684 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2685 } 2686 /* backward solve the upper triangular */ 2687 for (i=n-1; i>=0; i--){ 2688 v = aa + 9*diag[i] + 9; 2689 vi = aj + diag[i] + 1; 2690 nz = ai[i+1] - diag[i] - 1; 2691 idt = 3*i; 2692 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2693 while (nz--) { 2694 idx = 3*(*vi++); 2695 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2696 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2697 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2698 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2699 v += 9; 2700 } 2701 idc = 3*(*c--); 2702 v = aa + 9*diag[i]; 2703 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2704 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2705 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2706 } 2707 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2708 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2709 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2711 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2712 PetscFunctionReturn(0); 2713 } 2714 2715 /* 2716 Special case where the matrix was ILU(0) factored in the natural 2717 ordering. This eliminates the need for the column and row permutation. 2718 */ 2719 #undef __FUNCT__ 2720 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2721 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2722 { 2723 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2724 int n=a->mbs,*ai=a->i,*aj=a->j; 2725 int ierr,*diag = a->diag; 2726 MatScalar *aa=a->a,*v; 2727 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2728 int jdx,idt,idx,nz,*vi,i; 2729 2730 PetscFunctionBegin; 2731 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2732 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2733 2734 2735 /* forward solve the lower triangular */ 2736 idx = 0; 2737 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2738 for (i=1; i<n; i++) { 2739 v = aa + 9*ai[i]; 2740 vi = aj + ai[i]; 2741 nz = diag[i] - ai[i]; 2742 idx += 3; 2743 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2744 while (nz--) { 2745 jdx = 3*(*vi++); 2746 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2747 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2748 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2749 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2750 v += 9; 2751 } 2752 x[idx] = s1; 2753 x[1+idx] = s2; 2754 x[2+idx] = s3; 2755 } 2756 /* backward solve the upper triangular */ 2757 for (i=n-1; i>=0; i--){ 2758 v = aa + 9*diag[i] + 9; 2759 vi = aj + diag[i] + 1; 2760 nz = ai[i+1] - diag[i] - 1; 2761 idt = 3*i; 2762 s1 = x[idt]; s2 = x[1+idt]; 2763 s3 = x[2+idt]; 2764 while (nz--) { 2765 idx = 3*(*vi++); 2766 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2767 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2768 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2769 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2770 v += 9; 2771 } 2772 v = aa + 9*diag[i]; 2773 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2774 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2775 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2776 } 2777 2778 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2779 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2780 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2781 PetscFunctionReturn(0); 2782 } 2783 2784 #undef __FUNCT__ 2785 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2786 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2787 { 2788 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2789 IS iscol=a->col,isrow=a->row; 2790 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2791 int *diag = a->diag; 2792 MatScalar *aa=a->a,*v; 2793 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2794 2795 PetscFunctionBegin; 2796 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2797 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2798 t = a->solve_work; 2799 2800 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2801 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2802 2803 /* forward solve the lower triangular */ 2804 idx = 2*(*r++); 2805 t[0] = b[idx]; t[1] = b[1+idx]; 2806 for (i=1; i<n; i++) { 2807 v = aa + 4*ai[i]; 2808 vi = aj + ai[i]; 2809 nz = diag[i] - ai[i]; 2810 idx = 2*(*r++); 2811 s1 = b[idx]; s2 = b[1+idx]; 2812 while (nz--) { 2813 idx = 2*(*vi++); 2814 x1 = t[idx]; x2 = t[1+idx]; 2815 s1 -= v[0]*x1 + v[2]*x2; 2816 s2 -= v[1]*x1 + v[3]*x2; 2817 v += 4; 2818 } 2819 idx = 2*i; 2820 t[idx] = s1; t[1+idx] = s2; 2821 } 2822 /* backward solve the upper triangular */ 2823 for (i=n-1; i>=0; i--){ 2824 v = aa + 4*diag[i] + 4; 2825 vi = aj + diag[i] + 1; 2826 nz = ai[i+1] - diag[i] - 1; 2827 idt = 2*i; 2828 s1 = t[idt]; s2 = t[1+idt]; 2829 while (nz--) { 2830 idx = 2*(*vi++); 2831 x1 = t[idx]; x2 = t[1+idx]; 2832 s1 -= v[0]*x1 + v[2]*x2; 2833 s2 -= v[1]*x1 + v[3]*x2; 2834 v += 4; 2835 } 2836 idc = 2*(*c--); 2837 v = aa + 4*diag[i]; 2838 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2839 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2840 } 2841 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2842 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2843 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2844 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2845 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2846 PetscFunctionReturn(0); 2847 } 2848 2849 /* 2850 Special case where the matrix was ILU(0) factored in the natural 2851 ordering. This eliminates the need for the column and row permutation. 2852 */ 2853 #undef __FUNCT__ 2854 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2855 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2856 { 2857 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2858 int n=a->mbs,*ai=a->i,*aj=a->j; 2859 int ierr,*diag = a->diag; 2860 MatScalar *aa=a->a,*v; 2861 PetscScalar *x,*b,s1,s2,x1,x2; 2862 int jdx,idt,idx,nz,*vi,i; 2863 2864 PetscFunctionBegin; 2865 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2866 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2867 2868 /* forward solve the lower triangular */ 2869 idx = 0; 2870 x[0] = b[0]; x[1] = b[1]; 2871 for (i=1; i<n; i++) { 2872 v = aa + 4*ai[i]; 2873 vi = aj + ai[i]; 2874 nz = diag[i] - ai[i]; 2875 idx += 2; 2876 s1 = b[idx];s2 = b[1+idx]; 2877 while (nz--) { 2878 jdx = 2*(*vi++); 2879 x1 = x[jdx];x2 = x[1+jdx]; 2880 s1 -= v[0]*x1 + v[2]*x2; 2881 s2 -= v[1]*x1 + v[3]*x2; 2882 v += 4; 2883 } 2884 x[idx] = s1; 2885 x[1+idx] = s2; 2886 } 2887 /* backward solve the upper triangular */ 2888 for (i=n-1; i>=0; i--){ 2889 v = aa + 4*diag[i] + 4; 2890 vi = aj + diag[i] + 1; 2891 nz = ai[i+1] - diag[i] - 1; 2892 idt = 2*i; 2893 s1 = x[idt]; s2 = x[1+idt]; 2894 while (nz--) { 2895 idx = 2*(*vi++); 2896 x1 = x[idx]; x2 = x[1+idx]; 2897 s1 -= v[0]*x1 + v[2]*x2; 2898 s2 -= v[1]*x1 + v[3]*x2; 2899 v += 4; 2900 } 2901 v = aa + 4*diag[i]; 2902 x[idt] = v[0]*s1 + v[2]*s2; 2903 x[1+idt] = v[1]*s1 + v[3]*s2; 2904 } 2905 2906 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2907 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2908 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2909 PetscFunctionReturn(0); 2910 } 2911 2912 #undef __FUNCT__ 2913 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2914 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2915 { 2916 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2917 IS iscol=a->col,isrow=a->row; 2918 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2919 int *diag = a->diag; 2920 MatScalar *aa=a->a,*v; 2921 PetscScalar *x,*b,s1,*t; 2922 2923 PetscFunctionBegin; 2924 if (!n) PetscFunctionReturn(0); 2925 2926 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2927 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2928 t = a->solve_work; 2929 2930 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2931 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2932 2933 /* forward solve the lower triangular */ 2934 t[0] = b[*r++]; 2935 for (i=1; i<n; i++) { 2936 v = aa + ai[i]; 2937 vi = aj + ai[i]; 2938 nz = diag[i] - ai[i]; 2939 s1 = b[*r++]; 2940 while (nz--) { 2941 s1 -= (*v++)*t[*vi++]; 2942 } 2943 t[i] = s1; 2944 } 2945 /* backward solve the upper triangular */ 2946 for (i=n-1; i>=0; i--){ 2947 v = aa + diag[i] + 1; 2948 vi = aj + diag[i] + 1; 2949 nz = ai[i+1] - diag[i] - 1; 2950 s1 = t[i]; 2951 while (nz--) { 2952 s1 -= (*v++)*t[*vi++]; 2953 } 2954 x[*c--] = t[i] = aa[diag[i]]*s1; 2955 } 2956 2957 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2958 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2959 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2960 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2961 PetscLogFlops(2*1*(a->nz) - A->n); 2962 PetscFunctionReturn(0); 2963 } 2964 /* 2965 Special case where the matrix was ILU(0) factored in the natural 2966 ordering. This eliminates the need for the column and row permutation. 2967 */ 2968 #undef __FUNCT__ 2969 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2970 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2971 { 2972 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2973 int n=a->mbs,*ai=a->i,*aj=a->j; 2974 int ierr,*diag = a->diag; 2975 MatScalar *aa=a->a; 2976 PetscScalar *x,*b; 2977 PetscScalar s1,x1; 2978 MatScalar *v; 2979 int jdx,idt,idx,nz,*vi,i; 2980 2981 PetscFunctionBegin; 2982 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2983 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2984 2985 /* forward solve the lower triangular */ 2986 idx = 0; 2987 x[0] = b[0]; 2988 for (i=1; i<n; i++) { 2989 v = aa + ai[i]; 2990 vi = aj + ai[i]; 2991 nz = diag[i] - ai[i]; 2992 idx += 1; 2993 s1 = b[idx]; 2994 while (nz--) { 2995 jdx = *vi++; 2996 x1 = x[jdx]; 2997 s1 -= v[0]*x1; 2998 v += 1; 2999 } 3000 x[idx] = s1; 3001 } 3002 /* backward solve the upper triangular */ 3003 for (i=n-1; i>=0; i--){ 3004 v = aa + diag[i] + 1; 3005 vi = aj + diag[i] + 1; 3006 nz = ai[i+1] - diag[i] - 1; 3007 idt = i; 3008 s1 = x[idt]; 3009 while (nz--) { 3010 idx = *vi++; 3011 x1 = x[idx]; 3012 s1 -= v[0]*x1; 3013 v += 1; 3014 } 3015 v = aa + diag[i]; 3016 x[idt] = v[0]*s1; 3017 } 3018 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 3019 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 3020 PetscLogFlops(2*(a->nz) - A->n); 3021 PetscFunctionReturn(0); 3022 } 3023 3024 /* ----------------------------------------------------------------*/ 3025 /* 3026 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 3027 except that the data structure of Mat_SeqAIJ is slightly different. 3028 Not a good example of code reuse. 3029 */ 3030 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 3031 3032 #undef __FUNCT__ 3033 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 3034 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact) 3035 { 3036 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 3037 IS isicol; 3038 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 3039 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 3040 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 3041 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 3042 PetscTruth col_identity,row_identity; 3043 PetscReal f; 3044 3045 PetscFunctionBegin; 3046 f = info->fill; 3047 levels = (int)info->levels; 3048 diagonal_fill = (int)info->diagonal_fill; 3049 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 3050 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 3051 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 3052 3053 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 3054 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 3055 (*fact)->factor = FACTOR_LU; 3056 b = (Mat_SeqBAIJ*)(*fact)->data; 3057 if (!b->diag) { 3058 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3059 } 3060 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 3061 b->row = isrow; 3062 b->col = iscol; 3063 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3064 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3065 b->icol = isicol; 3066 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3067 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3068 } else { /* general case perform the symbolic factorization */ 3069 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 3070 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 3071 3072 /* get new row pointers */ 3073 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 3074 ainew[0] = 0; 3075 /* don't know how many column pointers are needed so estimate */ 3076 jmax = (int)(f*ai[n] + 1); 3077 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 3078 /* ajfill is level of fill for each fill entry */ 3079 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 3080 /* fill is a linked list of nonzeros in active row */ 3081 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 3082 /* im is level for each filled value */ 3083 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 3084 /* dloc is location of diagonal in factor */ 3085 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 3086 dloc[0] = 0; 3087 for (prow=0; prow<n; prow++) { 3088 3089 /* copy prow into linked list */ 3090 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 3091 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 3092 xi = aj + ai[r[prow]]; 3093 fill[n] = n; 3094 fill[prow] = -1; /* marker for diagonal entry */ 3095 while (nz--) { 3096 fm = n; 3097 idx = ic[*xi++]; 3098 do { 3099 m = fm; 3100 fm = fill[m]; 3101 } while (fm < idx); 3102 fill[m] = idx; 3103 fill[idx] = fm; 3104 im[idx] = 0; 3105 } 3106 3107 /* make sure diagonal entry is included */ 3108 if (diagonal_fill && fill[prow] == -1) { 3109 fm = n; 3110 while (fill[fm] < prow) fm = fill[fm]; 3111 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 3112 fill[fm] = prow; 3113 im[prow] = 0; 3114 nzf++; 3115 dcount++; 3116 } 3117 3118 nzi = 0; 3119 row = fill[n]; 3120 while (row < prow) { 3121 incrlev = im[row] + 1; 3122 nz = dloc[row]; 3123 xi = ajnew + ainew[row] + nz + 1; 3124 flev = ajfill + ainew[row] + nz + 1; 3125 nnz = ainew[row+1] - ainew[row] - nz - 1; 3126 fm = row; 3127 while (nnz-- > 0) { 3128 idx = *xi++; 3129 if (*flev + incrlev > levels) { 3130 flev++; 3131 continue; 3132 } 3133 do { 3134 m = fm; 3135 fm = fill[m]; 3136 } while (fm < idx); 3137 if (fm != idx) { 3138 im[idx] = *flev + incrlev; 3139 fill[m] = idx; 3140 fill[idx] = fm; 3141 fm = idx; 3142 nzf++; 3143 } else { 3144 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 3145 } 3146 flev++; 3147 } 3148 row = fill[row]; 3149 nzi++; 3150 } 3151 /* copy new filled row into permanent storage */ 3152 ainew[prow+1] = ainew[prow] + nzf; 3153 if (ainew[prow+1] > jmax) { 3154 3155 /* estimate how much additional space we will need */ 3156 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 3157 /* just double the memory each time */ 3158 int maxadd = jmax; 3159 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 3160 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 3161 jmax += maxadd; 3162 3163 /* allocate a longer ajnew and ajfill */ 3164 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3165 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3166 ierr = PetscFree(ajnew);CHKERRQ(ierr); 3167 ajnew = xi; 3168 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 3169 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 3170 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3171 ajfill = xi; 3172 reallocate++; /* count how many reallocations are needed */ 3173 } 3174 xi = ajnew + ainew[prow]; 3175 flev = ajfill + ainew[prow]; 3176 dloc[prow] = nzi; 3177 fm = fill[n]; 3178 while (nzf--) { 3179 *xi++ = fm; 3180 *flev++ = im[fm]; 3181 fm = fill[fm]; 3182 } 3183 /* make sure row has diagonal entry */ 3184 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 3185 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 3186 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 3187 } 3188 } 3189 ierr = PetscFree(ajfill);CHKERRQ(ierr); 3190 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 3191 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 3192 ierr = PetscFree(fill);CHKERRQ(ierr); 3193 ierr = PetscFree(im);CHKERRQ(ierr); 3194 3195 { 3196 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3197 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3198 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3199 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3200 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3201 if (diagonal_fill) { 3202 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3203 } 3204 } 3205 3206 /* put together the new matrix */ 3207 ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr); 3208 ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr); 3209 ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr); 3210 PetscLogObjectParent(*fact,isicol); 3211 b = (Mat_SeqBAIJ*)(*fact)->data; 3212 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3213 b->singlemalloc = PETSC_FALSE; 3214 len = bs2*ainew[n]*sizeof(MatScalar); 3215 /* the next line frees the default space generated by the Create() */ 3216 ierr = PetscFree(b->a);CHKERRQ(ierr); 3217 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3218 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3219 b->j = ajnew; 3220 b->i = ainew; 3221 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3222 b->diag = dloc; 3223 b->ilen = 0; 3224 b->imax = 0; 3225 b->row = isrow; 3226 b->col = iscol; 3227 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3228 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3229 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3230 b->icol = isicol; 3231 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3232 /* In b structure: Free imax, ilen, old a, old j. 3233 Allocate dloc, solve_work, new a, new j */ 3234 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3235 b->maxnz = b->nz = ainew[n]; 3236 (*fact)->factor = FACTOR_LU; 3237 3238 (*fact)->info.factor_mallocs = reallocate; 3239 (*fact)->info.fill_ratio_given = f; 3240 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3241 } 3242 3243 if (row_identity && col_identity) { 3244 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3245 } 3246 PetscFunctionReturn(0); 3247 } 3248 3249 #undef __FUNCT__ 3250 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE" 3251 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A) 3252 { 3253 /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */ 3254 /* int i,*AJ=a->j,nz=a->nz; */ 3255 PetscFunctionBegin; 3256 /* Undo Column scaling */ 3257 /* while (nz--) { */ 3258 /* AJ[i] = AJ[i]/4; */ 3259 /* } */ 3260 /* This should really invoke a push/pop logic, but we don't have that yet. */ 3261 A->ops->setunfactored = PETSC_NULL; 3262 PetscFunctionReturn(0); 3263 } 3264 3265 #undef __FUNCT__ 3266 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj" 3267 int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A) 3268 { 3269 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3270 int *AJ=a->j,nz=a->nz; 3271 unsigned short *aj=(unsigned short *)AJ; 3272 PetscFunctionBegin; 3273 /* Is this really necessary? */ 3274 while (nz--) { 3275 AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */ 3276 } 3277 A->ops->setunfactored = PETSC_NULL; 3278 PetscFunctionReturn(0); 3279 } 3280 3281 #undef __FUNCT__ 3282 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3283 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3284 { 3285 /* 3286 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3287 with natural ordering 3288 */ 3289 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3290 3291 PetscFunctionBegin; 3292 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3293 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3294 switch (a->bs) { 3295 case 1: 3296 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3297 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3298 break; 3299 case 2: 3300 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3301 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3302 break; 3303 case 3: 3304 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3305 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3306 break; 3307 case 4: 3308 #if defined(PETSC_USE_MAT_SINGLE) 3309 { 3310 PetscTruth sse_enabled_local; 3311 int ierr; 3312 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3313 if (sse_enabled_local) { 3314 # if defined(PETSC_HAVE_SSE) 3315 int i,*AJ=a->j,nz=a->nz,n=a->mbs; 3316 if (n==(unsigned short)n) { 3317 unsigned short *aj=(unsigned short *)AJ; 3318 for (i=0;i<nz;i++) { 3319 aj[i] = (unsigned short)AJ[i]; 3320 } 3321 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3322 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj; 3323 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n"); 3324 } else { 3325 /* Scale the column indices for easier indexing in MatSolve. */ 3326 /* for (i=0;i<nz;i++) { */ 3327 /* AJ[i] = AJ[i]*4; */ 3328 /* } */ 3329 inA->ops->setunfactored = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE; 3330 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3331 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n"); 3332 } 3333 # else 3334 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3335 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3336 # endif 3337 } else { 3338 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3339 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3340 } 3341 } 3342 #else 3343 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3344 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3345 #endif 3346 break; 3347 case 5: 3348 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3349 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3350 break; 3351 case 6: 3352 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3353 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3354 break; 3355 case 7: 3356 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3357 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3358 break; 3359 } 3360 PetscFunctionReturn(0); 3361 } 3362 3363 #undef __FUNCT__ 3364 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3365 int MatSeqBAIJ_UpdateSolvers(Mat A) 3366 { 3367 /* 3368 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3369 with natural ordering 3370 */ 3371 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3372 IS row = a->row, col = a->col; 3373 PetscTruth row_identity, col_identity; 3374 PetscTruth use_natural; 3375 int ierr; 3376 3377 PetscFunctionBegin; 3378 3379 use_natural = PETSC_FALSE; 3380 if (row && col) { 3381 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3382 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3383 3384 if (row_identity && col_identity) { 3385 use_natural = PETSC_TRUE; 3386 } 3387 } else { 3388 use_natural = PETSC_TRUE; 3389 } 3390 3391 switch (a->bs) { 3392 case 1: 3393 if (use_natural) { 3394 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3395 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3396 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3397 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3398 } else { 3399 A->ops->solve = MatSolve_SeqBAIJ_1; 3400 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3401 } 3402 break; 3403 case 2: 3404 if (use_natural) { 3405 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3406 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3407 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3408 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3409 } else { 3410 A->ops->solve = MatSolve_SeqBAIJ_2; 3411 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3412 } 3413 break; 3414 case 3: 3415 if (use_natural) { 3416 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3417 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3418 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3419 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3420 } else { 3421 A->ops->solve = MatSolve_SeqBAIJ_3; 3422 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3423 } 3424 break; 3425 case 4: 3426 { 3427 PetscTruth sse_enabled_local; 3428 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3429 if (use_natural) { 3430 #if defined(PETSC_USE_MAT_SINGLE) 3431 if (sse_enabled_local) { /* Natural + Single + SSE */ 3432 # if defined(PETSC_HAVE_SSE) 3433 int n=a->mbs; 3434 if (n==(unsigned short)n) { 3435 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj; 3436 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n"); 3437 } else { 3438 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3439 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n"); 3440 } 3441 # else 3442 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3443 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3444 # endif 3445 } else { /* Natural + Single */ 3446 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3447 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3448 } 3449 #else 3450 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3451 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3452 #endif 3453 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3454 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3455 } else { /* Arbitrary ordering */ 3456 #if defined(PETSC_USE_MAT_SINGLE) 3457 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3458 # if defined(PETSC_HAVE_SSE) 3459 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3460 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3461 # else 3462 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3463 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3464 # endif 3465 } else { /* Arbitrary + Single */ 3466 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3467 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3468 } 3469 #else 3470 A->ops->solve = MatSolve_SeqBAIJ_4; 3471 #endif 3472 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3473 } 3474 } 3475 break; 3476 case 5: 3477 if (use_natural) { 3478 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3479 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3480 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3481 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3482 } else { 3483 A->ops->solve = MatSolve_SeqBAIJ_5; 3484 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3485 } 3486 break; 3487 case 6: 3488 if (use_natural) { 3489 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3490 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3491 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3492 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3493 } else { 3494 A->ops->solve = MatSolve_SeqBAIJ_6; 3495 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3496 } 3497 break; 3498 case 7: 3499 if (use_natural) { 3500 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3501 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3502 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3503 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3504 } else { 3505 A->ops->solve = MatSolve_SeqBAIJ_7; 3506 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3507 } 3508 break; 3509 default: 3510 A->ops->solve = MatSolve_SeqBAIJ_N; 3511 break; 3512 } 3513 PetscFunctionReturn(0); 3514 } 3515 3516 #undef __FUNCT__ 3517 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3518 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3519 int ierr; 3520 3521 PetscFunctionBegin; 3522 ierr = MatSeqBAIJ_UpdateSolvers(A); 3523 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3524 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3525 } else { 3526 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3527 } 3528 PetscFunctionReturn(0); 3529 } 3530 3531 #undef __FUNCT__ 3532 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3533 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3534 int ierr; 3535 3536 PetscFunctionBegin; 3537 ierr = MatSeqBAIJ_UpdateSolvers(A); 3538 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3539 PetscFunctionReturn(0); 3540 } 3541 3542 3543