1 /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/ 2 /* 3 Factorization code for BAIJ format. 4 */ 5 6 #include "src/mat/impls/baij/seq/baij.h" 7 #include "src/vec/vecimpl.h" 8 #include "src/inline/ilu.h" 9 #include "src/inline/dot.h" 10 11 #undef __FUNCT__ 12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering" 13 int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 14 { 15 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 16 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 17 int *diag = a->diag; 18 MatScalar *aa=a->a,*v; 19 PetscScalar s1,*x,*b; 20 21 PetscFunctionBegin; 22 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*x[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PetscLogFlops(2*(a->nz) - A->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNCT__ 56 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 PetscScalar s1,s2,x1,x2; 64 PetscScalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 68 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 69 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 70 71 /* forward solve the U^T */ 72 idx = 0; 73 for (i=0; i<n; i++) { 74 75 v = aa + 4*diag[i]; 76 /* multiply by the inverse of the block diagonal */ 77 x1 = x[idx]; x2 = x[1+idx]; 78 s1 = v[0]*x1 + v[1]*x2; 79 s2 = v[2]*x1 + v[3]*x2; 80 v += 4; 81 82 vi = aj + diag[i] + 1; 83 nz = ai[i+1] - diag[i] - 1; 84 while (nz--) { 85 oidx = 2*(*vi++); 86 x[oidx] -= v[0]*s1 + v[1]*s2; 87 x[oidx+1] -= v[2]*s1 + v[3]*s2; 88 v += 4; 89 } 90 x[idx] = s1;x[1+idx] = s2; 91 idx += 2; 92 } 93 /* backward solve the L^T */ 94 for (i=n-1; i>=0; i--){ 95 v = aa + 4*diag[i] - 4; 96 vi = aj + diag[i] - 1; 97 nz = diag[i] - ai[i]; 98 idt = 2*i; 99 s1 = x[idt]; s2 = x[1+idt]; 100 while (nz--) { 101 idx = 2*(*vi--); 102 x[idx] -= v[0]*s1 + v[1]*s2; 103 x[idx+1] -= v[2]*s1 + v[3]*s2; 104 v -= 4; 105 } 106 } 107 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 108 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 109 PetscLogFlops(2*4*(a->nz) - 2*A->n); 110 PetscFunctionReturn(0); 111 } 112 113 #undef __FUNCT__ 114 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering" 115 int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 116 { 117 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 118 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 119 int *diag = a->diag,oidx; 120 MatScalar *aa=a->a,*v; 121 PetscScalar s1,s2,s3,x1,x2,x3; 122 PetscScalar *x,*b; 123 124 PetscFunctionBegin; 125 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 126 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 127 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 128 129 /* forward solve the U^T */ 130 idx = 0; 131 for (i=0; i<n; i++) { 132 133 v = aa + 9*diag[i]; 134 /* multiply by the inverse of the block diagonal */ 135 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 136 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 137 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 138 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 139 v += 9; 140 141 vi = aj + diag[i] + 1; 142 nz = ai[i+1] - diag[i] - 1; 143 while (nz--) { 144 oidx = 3*(*vi++); 145 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 146 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 147 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 148 v += 9; 149 } 150 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 151 idx += 3; 152 } 153 /* backward solve the L^T */ 154 for (i=n-1; i>=0; i--){ 155 v = aa + 9*diag[i] - 9; 156 vi = aj + diag[i] - 1; 157 nz = diag[i] - ai[i]; 158 idt = 3*i; 159 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 160 while (nz--) { 161 idx = 3*(*vi--); 162 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 163 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 164 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 165 v -= 9; 166 } 167 } 168 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 169 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 170 PetscLogFlops(2*9*(a->nz) - 3*A->n); 171 PetscFunctionReturn(0); 172 } 173 174 #undef __FUNCT__ 175 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering" 176 int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 177 { 178 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 179 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 180 int *diag = a->diag,oidx; 181 MatScalar *aa=a->a,*v; 182 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 183 PetscScalar *x,*b; 184 185 PetscFunctionBegin; 186 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 187 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 188 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 189 190 /* forward solve the U^T */ 191 idx = 0; 192 for (i=0; i<n; i++) { 193 194 v = aa + 16*diag[i]; 195 /* multiply by the inverse of the block diagonal */ 196 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; 197 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 198 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 199 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 200 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 201 v += 16; 202 203 vi = aj + diag[i] + 1; 204 nz = ai[i+1] - diag[i] - 1; 205 while (nz--) { 206 oidx = 4*(*vi++); 207 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 208 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 209 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 210 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 211 v += 16; 212 } 213 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 214 idx += 4; 215 } 216 /* backward solve the L^T */ 217 for (i=n-1; i>=0; i--){ 218 v = aa + 16*diag[i] - 16; 219 vi = aj + diag[i] - 1; 220 nz = diag[i] - ai[i]; 221 idt = 4*i; 222 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 223 while (nz--) { 224 idx = 4*(*vi--); 225 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 226 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 227 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 228 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 229 v -= 16; 230 } 231 } 232 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 233 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 234 PetscLogFlops(2*16*(a->nz) - 4*A->n); 235 PetscFunctionReturn(0); 236 } 237 238 #undef __FUNCT__ 239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering" 240 int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 241 { 242 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 243 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 244 int *diag = a->diag,oidx; 245 MatScalar *aa=a->a,*v; 246 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 247 PetscScalar *x,*b; 248 249 PetscFunctionBegin; 250 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 251 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 252 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 253 254 /* forward solve the U^T */ 255 idx = 0; 256 for (i=0; i<n; i++) { 257 258 v = aa + 25*diag[i]; 259 /* multiply by the inverse of the block diagonal */ 260 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 261 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 262 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 263 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 264 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 265 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 266 v += 25; 267 268 vi = aj + diag[i] + 1; 269 nz = ai[i+1] - diag[i] - 1; 270 while (nz--) { 271 oidx = 5*(*vi++); 272 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 273 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 274 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 275 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 276 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 277 v += 25; 278 } 279 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 280 idx += 5; 281 } 282 /* backward solve the L^T */ 283 for (i=n-1; i>=0; i--){ 284 v = aa + 25*diag[i] - 25; 285 vi = aj + diag[i] - 1; 286 nz = diag[i] - ai[i]; 287 idt = 5*i; 288 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 289 while (nz--) { 290 idx = 5*(*vi--); 291 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 292 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 293 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 294 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 295 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 296 v -= 25; 297 } 298 } 299 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 300 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 301 PetscLogFlops(2*25*(a->nz) - 5*A->n); 302 PetscFunctionReturn(0); 303 } 304 305 #undef __FUNCT__ 306 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering" 307 int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 308 { 309 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 310 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 311 int *diag = a->diag,oidx; 312 MatScalar *aa=a->a,*v; 313 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 314 PetscScalar *x,*b; 315 316 PetscFunctionBegin; 317 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 318 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 319 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 320 321 /* forward solve the U^T */ 322 idx = 0; 323 for (i=0; i<n; i++) { 324 325 v = aa + 36*diag[i]; 326 /* multiply by the inverse of the block diagonal */ 327 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 328 x6 = x[5+idx]; 329 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 330 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 331 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 332 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 333 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 334 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 335 v += 36; 336 337 vi = aj + diag[i] + 1; 338 nz = ai[i+1] - diag[i] - 1; 339 while (nz--) { 340 oidx = 6*(*vi++); 341 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 342 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 343 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 344 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 345 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 346 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 347 v += 36; 348 } 349 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 350 x[5+idx] = s6; 351 idx += 6; 352 } 353 /* backward solve the L^T */ 354 for (i=n-1; i>=0; i--){ 355 v = aa + 36*diag[i] - 36; 356 vi = aj + diag[i] - 1; 357 nz = diag[i] - ai[i]; 358 idt = 6*i; 359 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 360 s6 = x[5+idt]; 361 while (nz--) { 362 idx = 6*(*vi--); 363 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 364 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 365 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 366 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 367 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 368 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 369 v -= 36; 370 } 371 } 372 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 373 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 374 PetscLogFlops(2*36*(a->nz) - 6*A->n); 375 PetscFunctionReturn(0); 376 } 377 378 #undef __FUNCT__ 379 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering" 380 int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 381 { 382 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 383 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 384 int *diag = a->diag,oidx; 385 MatScalar *aa=a->a,*v; 386 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 387 PetscScalar *x,*b; 388 389 PetscFunctionBegin; 390 ierr = VecCopy(bb,xx);CHKERRQ(ierr); 391 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 392 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 393 394 /* forward solve the U^T */ 395 idx = 0; 396 for (i=0; i<n; i++) { 397 398 v = aa + 49*diag[i]; 399 /* multiply by the inverse of the block diagonal */ 400 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 401 x6 = x[5+idx]; x7 = x[6+idx]; 402 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 403 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 404 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 405 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 406 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 407 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 408 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 409 v += 49; 410 411 vi = aj + diag[i] + 1; 412 nz = ai[i+1] - diag[i] - 1; 413 while (nz--) { 414 oidx = 7*(*vi++); 415 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 416 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 417 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 418 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 419 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 420 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 421 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 422 v += 49; 423 } 424 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 425 x[5+idx] = s6;x[6+idx] = s7; 426 idx += 7; 427 } 428 /* backward solve the L^T */ 429 for (i=n-1; i>=0; i--){ 430 v = aa + 49*diag[i] - 49; 431 vi = aj + diag[i] - 1; 432 nz = diag[i] - ai[i]; 433 idt = 7*i; 434 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 435 s6 = x[5+idt];s7 = x[6+idt]; 436 while (nz--) { 437 idx = 7*(*vi--); 438 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 439 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 440 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 441 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 442 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 443 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 444 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 445 v -= 49; 446 } 447 } 448 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 449 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 450 PetscLogFlops(2*49*(a->nz) - 7*A->n); 451 PetscFunctionReturn(0); 452 } 453 454 /*---------------------------------------------------------------------------------------------*/ 455 #undef __FUNCT__ 456 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1" 457 int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 458 { 459 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 460 IS iscol=a->col,isrow=a->row; 461 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 462 int *diag = a->diag; 463 MatScalar *aa=a->a,*v; 464 PetscScalar s1,*x,*b,*t; 465 466 PetscFunctionBegin; 467 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 468 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 469 t = a->solve_work; 470 471 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 472 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 473 474 /* copy the b into temp work space according to permutation */ 475 for (i=0; i<n; i++) { 476 t[i] = b[c[i]]; 477 } 478 479 /* forward solve the U^T */ 480 for (i=0; i<n; i++) { 481 482 v = aa + diag[i]; 483 /* multiply by the inverse of the block diagonal */ 484 s1 = (*v++)*t[i]; 485 vi = aj + diag[i] + 1; 486 nz = ai[i+1] - diag[i] - 1; 487 while (nz--) { 488 t[*vi++] -= (*v++)*s1; 489 } 490 t[i] = s1; 491 } 492 /* backward solve the L^T */ 493 for (i=n-1; i>=0; i--){ 494 v = aa + diag[i] - 1; 495 vi = aj + diag[i] - 1; 496 nz = diag[i] - ai[i]; 497 s1 = t[i]; 498 while (nz--) { 499 t[*vi--] -= (*v--)*s1; 500 } 501 } 502 503 /* copy t into x according to permutation */ 504 for (i=0; i<n; i++) { 505 x[r[i]] = t[i]; 506 } 507 508 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 509 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 510 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 511 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 512 PetscLogFlops(2*(a->nz) - A->n); 513 PetscFunctionReturn(0); 514 } 515 516 #undef __FUNCT__ 517 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2" 518 int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 519 { 520 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 521 IS iscol=a->col,isrow=a->row; 522 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 523 int *diag = a->diag,ii,ic,ir,oidx; 524 MatScalar *aa=a->a,*v; 525 PetscScalar s1,s2,x1,x2; 526 PetscScalar *x,*b,*t; 527 528 PetscFunctionBegin; 529 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 530 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 531 t = a->solve_work; 532 533 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 534 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 535 536 /* copy the b into temp work space according to permutation */ 537 ii = 0; 538 for (i=0; i<n; i++) { 539 ic = 2*c[i]; 540 t[ii] = b[ic]; 541 t[ii+1] = b[ic+1]; 542 ii += 2; 543 } 544 545 /* forward solve the U^T */ 546 idx = 0; 547 for (i=0; i<n; i++) { 548 549 v = aa + 4*diag[i]; 550 /* multiply by the inverse of the block diagonal */ 551 x1 = t[idx]; x2 = t[1+idx]; 552 s1 = v[0]*x1 + v[1]*x2; 553 s2 = v[2]*x1 + v[3]*x2; 554 v += 4; 555 556 vi = aj + diag[i] + 1; 557 nz = ai[i+1] - diag[i] - 1; 558 while (nz--) { 559 oidx = 2*(*vi++); 560 t[oidx] -= v[0]*s1 + v[1]*s2; 561 t[oidx+1] -= v[2]*s1 + v[3]*s2; 562 v += 4; 563 } 564 t[idx] = s1;t[1+idx] = s2; 565 idx += 2; 566 } 567 /* backward solve the L^T */ 568 for (i=n-1; i>=0; i--){ 569 v = aa + 4*diag[i] - 4; 570 vi = aj + diag[i] - 1; 571 nz = diag[i] - ai[i]; 572 idt = 2*i; 573 s1 = t[idt]; s2 = t[1+idt]; 574 while (nz--) { 575 idx = 2*(*vi--); 576 t[idx] -= v[0]*s1 + v[1]*s2; 577 t[idx+1] -= v[2]*s1 + v[3]*s2; 578 v -= 4; 579 } 580 } 581 582 /* copy t into x according to permutation */ 583 ii = 0; 584 for (i=0; i<n; i++) { 585 ir = 2*r[i]; 586 x[ir] = t[ii]; 587 x[ir+1] = t[ii+1]; 588 ii += 2; 589 } 590 591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 595 PetscLogFlops(2*4*(a->nz) - 2*A->n); 596 PetscFunctionReturn(0); 597 } 598 599 #undef __FUNCT__ 600 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3" 601 int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 602 { 603 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 604 IS iscol=a->col,isrow=a->row; 605 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 606 int *diag = a->diag,ii,ic,ir,oidx; 607 MatScalar *aa=a->a,*v; 608 PetscScalar s1,s2,s3,x1,x2,x3; 609 PetscScalar *x,*b,*t; 610 611 PetscFunctionBegin; 612 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 613 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 614 t = a->solve_work; 615 616 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 617 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 618 619 /* copy the b into temp work space according to permutation */ 620 ii = 0; 621 for (i=0; i<n; i++) { 622 ic = 3*c[i]; 623 t[ii] = b[ic]; 624 t[ii+1] = b[ic+1]; 625 t[ii+2] = b[ic+2]; 626 ii += 3; 627 } 628 629 /* forward solve the U^T */ 630 idx = 0; 631 for (i=0; i<n; i++) { 632 633 v = aa + 9*diag[i]; 634 /* multiply by the inverse of the block diagonal */ 635 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 636 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 637 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 638 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 639 v += 9; 640 641 vi = aj + diag[i] + 1; 642 nz = ai[i+1] - diag[i] - 1; 643 while (nz--) { 644 oidx = 3*(*vi++); 645 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 646 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 647 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 648 v += 9; 649 } 650 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 651 idx += 3; 652 } 653 /* backward solve the L^T */ 654 for (i=n-1; i>=0; i--){ 655 v = aa + 9*diag[i] - 9; 656 vi = aj + diag[i] - 1; 657 nz = diag[i] - ai[i]; 658 idt = 3*i; 659 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 660 while (nz--) { 661 idx = 3*(*vi--); 662 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 663 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 664 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 665 v -= 9; 666 } 667 } 668 669 /* copy t into x according to permutation */ 670 ii = 0; 671 for (i=0; i<n; i++) { 672 ir = 3*r[i]; 673 x[ir] = t[ii]; 674 x[ir+1] = t[ii+1]; 675 x[ir+2] = t[ii+2]; 676 ii += 3; 677 } 678 679 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 680 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 681 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 682 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 683 PetscLogFlops(2*9*(a->nz) - 3*A->n); 684 PetscFunctionReturn(0); 685 } 686 687 #undef __FUNCT__ 688 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4" 689 int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 690 { 691 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 692 IS iscol=a->col,isrow=a->row; 693 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 694 int *diag = a->diag,ii,ic,ir,oidx; 695 MatScalar *aa=a->a,*v; 696 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 697 PetscScalar *x,*b,*t; 698 699 PetscFunctionBegin; 700 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 701 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 702 t = a->solve_work; 703 704 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 705 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 706 707 /* copy the b into temp work space according to permutation */ 708 ii = 0; 709 for (i=0; i<n; i++) { 710 ic = 4*c[i]; 711 t[ii] = b[ic]; 712 t[ii+1] = b[ic+1]; 713 t[ii+2] = b[ic+2]; 714 t[ii+3] = b[ic+3]; 715 ii += 4; 716 } 717 718 /* forward solve the U^T */ 719 idx = 0; 720 for (i=0; i<n; i++) { 721 722 v = aa + 16*diag[i]; 723 /* multiply by the inverse of the block diagonal */ 724 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 725 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 726 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 727 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 728 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 729 v += 16; 730 731 vi = aj + diag[i] + 1; 732 nz = ai[i+1] - diag[i] - 1; 733 while (nz--) { 734 oidx = 4*(*vi++); 735 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 736 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 737 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 738 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 739 v += 16; 740 } 741 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 742 idx += 4; 743 } 744 /* backward solve the L^T */ 745 for (i=n-1; i>=0; i--){ 746 v = aa + 16*diag[i] - 16; 747 vi = aj + diag[i] - 1; 748 nz = diag[i] - ai[i]; 749 idt = 4*i; 750 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 751 while (nz--) { 752 idx = 4*(*vi--); 753 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 754 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 755 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 756 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 757 v -= 16; 758 } 759 } 760 761 /* copy t into x according to permutation */ 762 ii = 0; 763 for (i=0; i<n; i++) { 764 ir = 4*r[i]; 765 x[ir] = t[ii]; 766 x[ir+1] = t[ii+1]; 767 x[ir+2] = t[ii+2]; 768 x[ir+3] = t[ii+3]; 769 ii += 4; 770 } 771 772 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 773 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 774 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 775 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 776 PetscLogFlops(2*16*(a->nz) - 4*A->n); 777 PetscFunctionReturn(0); 778 } 779 780 #undef __FUNCT__ 781 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5" 782 int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 783 { 784 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 785 IS iscol=a->col,isrow=a->row; 786 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 787 int *diag = a->diag,ii,ic,ir,oidx; 788 MatScalar *aa=a->a,*v; 789 PetscScalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 790 PetscScalar *x,*b,*t; 791 792 PetscFunctionBegin; 793 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 794 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 795 t = a->solve_work; 796 797 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 798 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 799 800 /* copy the b into temp work space according to permutation */ 801 ii = 0; 802 for (i=0; i<n; i++) { 803 ic = 5*c[i]; 804 t[ii] = b[ic]; 805 t[ii+1] = b[ic+1]; 806 t[ii+2] = b[ic+2]; 807 t[ii+3] = b[ic+3]; 808 t[ii+4] = b[ic+4]; 809 ii += 5; 810 } 811 812 /* forward solve the U^T */ 813 idx = 0; 814 for (i=0; i<n; i++) { 815 816 v = aa + 25*diag[i]; 817 /* multiply by the inverse of the block diagonal */ 818 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 819 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 820 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 821 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 822 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 823 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 824 v += 25; 825 826 vi = aj + diag[i] + 1; 827 nz = ai[i+1] - diag[i] - 1; 828 while (nz--) { 829 oidx = 5*(*vi++); 830 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 831 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 832 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 833 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 834 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 835 v += 25; 836 } 837 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 838 idx += 5; 839 } 840 /* backward solve the L^T */ 841 for (i=n-1; i>=0; i--){ 842 v = aa + 25*diag[i] - 25; 843 vi = aj + diag[i] - 1; 844 nz = diag[i] - ai[i]; 845 idt = 5*i; 846 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 847 while (nz--) { 848 idx = 5*(*vi--); 849 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 850 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 851 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 852 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 853 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 854 v -= 25; 855 } 856 } 857 858 /* copy t into x according to permutation */ 859 ii = 0; 860 for (i=0; i<n; i++) { 861 ir = 5*r[i]; 862 x[ir] = t[ii]; 863 x[ir+1] = t[ii+1]; 864 x[ir+2] = t[ii+2]; 865 x[ir+3] = t[ii+3]; 866 x[ir+4] = t[ii+4]; 867 ii += 5; 868 } 869 870 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 871 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 872 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 873 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 874 PetscLogFlops(2*25*(a->nz) - 5*A->n); 875 PetscFunctionReturn(0); 876 } 877 878 #undef __FUNCT__ 879 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6" 880 int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 881 { 882 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 883 IS iscol=a->col,isrow=a->row; 884 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 885 int *diag = a->diag,ii,ic,ir,oidx; 886 MatScalar *aa=a->a,*v; 887 PetscScalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 888 PetscScalar *x,*b,*t; 889 890 PetscFunctionBegin; 891 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 892 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 893 t = a->solve_work; 894 895 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 896 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 897 898 /* copy the b into temp work space according to permutation */ 899 ii = 0; 900 for (i=0; i<n; i++) { 901 ic = 6*c[i]; 902 t[ii] = b[ic]; 903 t[ii+1] = b[ic+1]; 904 t[ii+2] = b[ic+2]; 905 t[ii+3] = b[ic+3]; 906 t[ii+4] = b[ic+4]; 907 t[ii+5] = b[ic+5]; 908 ii += 6; 909 } 910 911 /* forward solve the U^T */ 912 idx = 0; 913 for (i=0; i<n; i++) { 914 915 v = aa + 36*diag[i]; 916 /* multiply by the inverse of the block diagonal */ 917 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 918 x6 = t[5+idx]; 919 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 920 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 921 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 922 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 923 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 924 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 925 v += 36; 926 927 vi = aj + diag[i] + 1; 928 nz = ai[i+1] - diag[i] - 1; 929 while (nz--) { 930 oidx = 6*(*vi++); 931 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 932 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 933 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 934 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 935 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 936 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 937 v += 36; 938 } 939 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 940 t[5+idx] = s6; 941 idx += 6; 942 } 943 /* backward solve the L^T */ 944 for (i=n-1; i>=0; i--){ 945 v = aa + 36*diag[i] - 36; 946 vi = aj + diag[i] - 1; 947 nz = diag[i] - ai[i]; 948 idt = 6*i; 949 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 950 s6 = t[5+idt]; 951 while (nz--) { 952 idx = 6*(*vi--); 953 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 954 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 955 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 956 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 957 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 958 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 959 v -= 36; 960 } 961 } 962 963 /* copy t into x according to permutation */ 964 ii = 0; 965 for (i=0; i<n; i++) { 966 ir = 6*r[i]; 967 x[ir] = t[ii]; 968 x[ir+1] = t[ii+1]; 969 x[ir+2] = t[ii+2]; 970 x[ir+3] = t[ii+3]; 971 x[ir+4] = t[ii+4]; 972 x[ir+5] = t[ii+5]; 973 ii += 6; 974 } 975 976 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 977 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 978 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 979 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 980 PetscLogFlops(2*36*(a->nz) - 6*A->n); 981 PetscFunctionReturn(0); 982 } 983 984 #undef __FUNCT__ 985 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7" 986 int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 987 { 988 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 989 IS iscol=a->col,isrow=a->row; 990 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 991 int *diag = a->diag,ii,ic,ir,oidx; 992 MatScalar *aa=a->a,*v; 993 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 994 PetscScalar *x,*b,*t; 995 996 PetscFunctionBegin; 997 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 998 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 999 t = a->solve_work; 1000 1001 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1002 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 1003 1004 /* copy the b into temp work space according to permutation */ 1005 ii = 0; 1006 for (i=0; i<n; i++) { 1007 ic = 7*c[i]; 1008 t[ii] = b[ic]; 1009 t[ii+1] = b[ic+1]; 1010 t[ii+2] = b[ic+2]; 1011 t[ii+3] = b[ic+3]; 1012 t[ii+4] = b[ic+4]; 1013 t[ii+5] = b[ic+5]; 1014 t[ii+6] = b[ic+6]; 1015 ii += 7; 1016 } 1017 1018 /* forward solve the U^T */ 1019 idx = 0; 1020 for (i=0; i<n; i++) { 1021 1022 v = aa + 49*diag[i]; 1023 /* multiply by the inverse of the block diagonal */ 1024 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1025 x6 = t[5+idx]; x7 = t[6+idx]; 1026 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1027 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1028 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1029 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1030 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1031 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1032 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1033 v += 49; 1034 1035 vi = aj + diag[i] + 1; 1036 nz = ai[i+1] - diag[i] - 1; 1037 while (nz--) { 1038 oidx = 7*(*vi++); 1039 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1040 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1041 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1042 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1043 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1044 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1045 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1046 v += 49; 1047 } 1048 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1049 t[5+idx] = s6;t[6+idx] = s7; 1050 idx += 7; 1051 } 1052 /* backward solve the L^T */ 1053 for (i=n-1; i>=0; i--){ 1054 v = aa + 49*diag[i] - 49; 1055 vi = aj + diag[i] - 1; 1056 nz = diag[i] - ai[i]; 1057 idt = 7*i; 1058 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1059 s6 = t[5+idt];s7 = t[6+idt]; 1060 while (nz--) { 1061 idx = 7*(*vi--); 1062 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1063 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1064 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1065 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1066 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1067 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1068 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1069 v -= 49; 1070 } 1071 } 1072 1073 /* copy t into x according to permutation */ 1074 ii = 0; 1075 for (i=0; i<n; i++) { 1076 ir = 7*r[i]; 1077 x[ir] = t[ii]; 1078 x[ir+1] = t[ii+1]; 1079 x[ir+2] = t[ii+2]; 1080 x[ir+3] = t[ii+3]; 1081 x[ir+4] = t[ii+4]; 1082 x[ir+5] = t[ii+5]; 1083 x[ir+6] = t[ii+6]; 1084 ii += 7; 1085 } 1086 1087 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1088 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1089 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1090 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1091 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1092 PetscFunctionReturn(0); 1093 } 1094 1095 /* ----------------------------------------------------------- */ 1096 #undef __FUNCT__ 1097 #define __FUNCT__ "MatSolve_SeqBAIJ_N" 1098 int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx) 1099 { 1100 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1101 IS iscol=a->col,isrow=a->row; 1102 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1103 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1104 MatScalar *aa=a->a,*v; 1105 PetscScalar *x,*b,*s,*t,*ls; 1106 1107 PetscFunctionBegin; 1108 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1109 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1110 t = a->solve_work; 1111 1112 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1113 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1114 1115 /* forward solve the lower triangular */ 1116 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1117 for (i=1; i<n; i++) { 1118 v = aa + bs2*ai[i]; 1119 vi = aj + ai[i]; 1120 nz = a->diag[i] - ai[i]; 1121 s = t + bs*i; 1122 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr); 1123 while (nz--) { 1124 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1125 v += bs2; 1126 } 1127 } 1128 /* backward solve the upper triangular */ 1129 ls = a->solve_work + A->n; 1130 for (i=n-1; i>=0; i--){ 1131 v = aa + bs2*(a->diag[i] + 1); 1132 vi = aj + a->diag[i] + 1; 1133 nz = ai[i+1] - a->diag[i] - 1; 1134 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1135 while (nz--) { 1136 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1137 v += bs2; 1138 } 1139 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1140 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr); 1141 } 1142 1143 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1144 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1145 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1146 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1147 PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n); 1148 PetscFunctionReturn(0); 1149 } 1150 1151 #undef __FUNCT__ 1152 #define __FUNCT__ "MatSolve_SeqBAIJ_7" 1153 int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx) 1154 { 1155 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1156 IS iscol=a->col,isrow=a->row; 1157 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1158 int *diag = a->diag; 1159 MatScalar *aa=a->a,*v; 1160 PetscScalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1161 PetscScalar *x,*b,*t; 1162 1163 PetscFunctionBegin; 1164 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1165 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1166 t = a->solve_work; 1167 1168 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1169 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1170 1171 /* forward solve the lower triangular */ 1172 idx = 7*(*r++); 1173 t[0] = b[idx]; t[1] = b[1+idx]; 1174 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1175 t[5] = b[5+idx]; t[6] = b[6+idx]; 1176 1177 for (i=1; i<n; i++) { 1178 v = aa + 49*ai[i]; 1179 vi = aj + ai[i]; 1180 nz = diag[i] - ai[i]; 1181 idx = 7*(*r++); 1182 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1183 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1184 while (nz--) { 1185 idx = 7*(*vi++); 1186 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1187 x4 = t[3+idx];x5 = t[4+idx]; 1188 x6 = t[5+idx];x7 = t[6+idx]; 1189 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1190 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1191 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1192 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1193 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1194 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1195 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1196 v += 49; 1197 } 1198 idx = 7*i; 1199 t[idx] = s1;t[1+idx] = s2; 1200 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1201 t[5+idx] = s6;t[6+idx] = s7; 1202 } 1203 /* backward solve the upper triangular */ 1204 for (i=n-1; i>=0; i--){ 1205 v = aa + 49*diag[i] + 49; 1206 vi = aj + diag[i] + 1; 1207 nz = ai[i+1] - diag[i] - 1; 1208 idt = 7*i; 1209 s1 = t[idt]; s2 = t[1+idt]; 1210 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1211 s6 = t[5+idt];s7 = t[6+idt]; 1212 while (nz--) { 1213 idx = 7*(*vi++); 1214 x1 = t[idx]; x2 = t[1+idx]; 1215 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1216 x6 = t[5+idx]; x7 = t[6+idx]; 1217 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1218 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1219 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1220 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1221 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1222 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1223 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1224 v += 49; 1225 } 1226 idc = 7*(*c--); 1227 v = aa + 49*diag[i]; 1228 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1229 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1230 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1231 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1232 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1233 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1234 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1235 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1236 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1237 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1238 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1239 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1240 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1241 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1242 } 1243 1244 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1245 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1246 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1247 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1248 PetscLogFlops(2*49*(a->nz) - 7*A->n); 1249 PetscFunctionReturn(0); 1250 } 1251 1252 #undef __FUNCT__ 1253 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering" 1254 int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1255 { 1256 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1257 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1258 int ierr,*diag = a->diag,jdx; 1259 MatScalar *aa=a->a,*v; 1260 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1261 1262 PetscFunctionBegin; 1263 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1264 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1265 /* forward solve the lower triangular */ 1266 idx = 0; 1267 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1268 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1269 x[6] = b[6+idx]; 1270 for (i=1; i<n; i++) { 1271 v = aa + 49*ai[i]; 1272 vi = aj + ai[i]; 1273 nz = diag[i] - ai[i]; 1274 idx = 7*i; 1275 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1276 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1277 s7 = b[6+idx]; 1278 while (nz--) { 1279 jdx = 7*(*vi++); 1280 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1281 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1282 x7 = x[6+jdx]; 1283 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1284 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1285 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1286 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1287 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1288 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1289 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1290 v += 49; 1291 } 1292 x[idx] = s1; 1293 x[1+idx] = s2; 1294 x[2+idx] = s3; 1295 x[3+idx] = s4; 1296 x[4+idx] = s5; 1297 x[5+idx] = s6; 1298 x[6+idx] = s7; 1299 } 1300 /* backward solve the upper triangular */ 1301 for (i=n-1; i>=0; i--){ 1302 v = aa + 49*diag[i] + 49; 1303 vi = aj + diag[i] + 1; 1304 nz = ai[i+1] - diag[i] - 1; 1305 idt = 7*i; 1306 s1 = x[idt]; s2 = x[1+idt]; 1307 s3 = x[2+idt]; s4 = x[3+idt]; 1308 s5 = x[4+idt]; s6 = x[5+idt]; 1309 s7 = x[6+idt]; 1310 while (nz--) { 1311 idx = 7*(*vi++); 1312 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1313 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1314 x7 = x[6+idx]; 1315 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1316 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1317 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1318 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1319 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1320 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1321 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1322 v += 49; 1323 } 1324 v = aa + 49*diag[i]; 1325 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1326 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1327 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1328 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1329 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1330 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1331 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1332 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1333 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1334 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1335 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1336 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1337 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1338 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1339 } 1340 1341 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1342 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1343 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1344 PetscFunctionReturn(0); 1345 } 1346 1347 #undef __FUNCT__ 1348 #define __FUNCT__ "MatSolve_SeqBAIJ_6" 1349 int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx) 1350 { 1351 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1352 IS iscol=a->col,isrow=a->row; 1353 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1354 int *diag = a->diag; 1355 MatScalar *aa=a->a,*v; 1356 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1357 1358 PetscFunctionBegin; 1359 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1360 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1361 t = a->solve_work; 1362 1363 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1364 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1365 1366 /* forward solve the lower triangular */ 1367 idx = 6*(*r++); 1368 t[0] = b[idx]; t[1] = b[1+idx]; 1369 t[2] = b[2+idx]; t[3] = b[3+idx]; 1370 t[4] = b[4+idx]; t[5] = b[5+idx]; 1371 for (i=1; i<n; i++) { 1372 v = aa + 36*ai[i]; 1373 vi = aj + ai[i]; 1374 nz = diag[i] - ai[i]; 1375 idx = 6*(*r++); 1376 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1377 s5 = b[4+idx]; s6 = b[5+idx]; 1378 while (nz--) { 1379 idx = 6*(*vi++); 1380 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1381 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1382 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1383 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1384 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1385 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1386 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1387 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1388 v += 36; 1389 } 1390 idx = 6*i; 1391 t[idx] = s1;t[1+idx] = s2; 1392 t[2+idx] = s3;t[3+idx] = s4; 1393 t[4+idx] = s5;t[5+idx] = s6; 1394 } 1395 /* backward solve the upper triangular */ 1396 for (i=n-1; i>=0; i--){ 1397 v = aa + 36*diag[i] + 36; 1398 vi = aj + diag[i] + 1; 1399 nz = ai[i+1] - diag[i] - 1; 1400 idt = 6*i; 1401 s1 = t[idt]; s2 = t[1+idt]; 1402 s3 = t[2+idt];s4 = t[3+idt]; 1403 s5 = t[4+idt];s6 = t[5+idt]; 1404 while (nz--) { 1405 idx = 6*(*vi++); 1406 x1 = t[idx]; x2 = t[1+idx]; 1407 x3 = t[2+idx]; x4 = t[3+idx]; 1408 x5 = t[4+idx]; x6 = t[5+idx]; 1409 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1410 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1411 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1412 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1413 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1414 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1415 v += 36; 1416 } 1417 idc = 6*(*c--); 1418 v = aa + 36*diag[i]; 1419 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1420 v[18]*s4+v[24]*s5+v[30]*s6; 1421 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1422 v[19]*s4+v[25]*s5+v[31]*s6; 1423 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1424 v[20]*s4+v[26]*s5+v[32]*s6; 1425 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1426 v[21]*s4+v[27]*s5+v[33]*s6; 1427 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1428 v[22]*s4+v[28]*s5+v[34]*s6; 1429 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1430 v[23]*s4+v[29]*s5+v[35]*s6; 1431 } 1432 1433 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1434 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1435 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1436 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1437 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1438 PetscFunctionReturn(0); 1439 } 1440 1441 #undef __FUNCT__ 1442 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering" 1443 int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1444 { 1445 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1446 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1447 int ierr,*diag = a->diag,jdx; 1448 MatScalar *aa=a->a,*v; 1449 PetscScalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1450 1451 PetscFunctionBegin; 1452 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1453 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1454 /* forward solve the lower triangular */ 1455 idx = 0; 1456 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1457 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1458 for (i=1; i<n; i++) { 1459 v = aa + 36*ai[i]; 1460 vi = aj + ai[i]; 1461 nz = diag[i] - ai[i]; 1462 idx = 6*i; 1463 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1464 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1465 while (nz--) { 1466 jdx = 6*(*vi++); 1467 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1468 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1469 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1470 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1471 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1472 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1473 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1474 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1475 v += 36; 1476 } 1477 x[idx] = s1; 1478 x[1+idx] = s2; 1479 x[2+idx] = s3; 1480 x[3+idx] = s4; 1481 x[4+idx] = s5; 1482 x[5+idx] = s6; 1483 } 1484 /* backward solve the upper triangular */ 1485 for (i=n-1; i>=0; i--){ 1486 v = aa + 36*diag[i] + 36; 1487 vi = aj + diag[i] + 1; 1488 nz = ai[i+1] - diag[i] - 1; 1489 idt = 6*i; 1490 s1 = x[idt]; s2 = x[1+idt]; 1491 s3 = x[2+idt]; s4 = x[3+idt]; 1492 s5 = x[4+idt]; s6 = x[5+idt]; 1493 while (nz--) { 1494 idx = 6*(*vi++); 1495 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1496 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1497 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1498 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1499 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1500 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1501 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1502 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1503 v += 36; 1504 } 1505 v = aa + 36*diag[i]; 1506 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1507 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1508 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1509 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1510 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1511 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1512 } 1513 1514 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1515 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1516 PetscLogFlops(2*36*(a->nz) - 6*A->n); 1517 PetscFunctionReturn(0); 1518 } 1519 1520 #undef __FUNCT__ 1521 #define __FUNCT__ "MatSolve_SeqBAIJ_5" 1522 int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx) 1523 { 1524 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1525 IS iscol=a->col,isrow=a->row; 1526 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1527 int *diag = a->diag; 1528 MatScalar *aa=a->a,*v; 1529 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1530 1531 PetscFunctionBegin; 1532 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1533 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1534 t = a->solve_work; 1535 1536 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1537 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1538 1539 /* forward solve the lower triangular */ 1540 idx = 5*(*r++); 1541 t[0] = b[idx]; t[1] = b[1+idx]; 1542 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1543 for (i=1; i<n; i++) { 1544 v = aa + 25*ai[i]; 1545 vi = aj + ai[i]; 1546 nz = diag[i] - ai[i]; 1547 idx = 5*(*r++); 1548 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1549 s5 = b[4+idx]; 1550 while (nz--) { 1551 idx = 5*(*vi++); 1552 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1553 x4 = t[3+idx];x5 = t[4+idx]; 1554 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1555 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1556 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1557 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1558 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1559 v += 25; 1560 } 1561 idx = 5*i; 1562 t[idx] = s1;t[1+idx] = s2; 1563 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1564 } 1565 /* backward solve the upper triangular */ 1566 for (i=n-1; i>=0; i--){ 1567 v = aa + 25*diag[i] + 25; 1568 vi = aj + diag[i] + 1; 1569 nz = ai[i+1] - diag[i] - 1; 1570 idt = 5*i; 1571 s1 = t[idt]; s2 = t[1+idt]; 1572 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1573 while (nz--) { 1574 idx = 5*(*vi++); 1575 x1 = t[idx]; x2 = t[1+idx]; 1576 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1577 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1578 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1579 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1580 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1581 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1582 v += 25; 1583 } 1584 idc = 5*(*c--); 1585 v = aa + 25*diag[i]; 1586 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1587 v[15]*s4+v[20]*s5; 1588 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1589 v[16]*s4+v[21]*s5; 1590 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1591 v[17]*s4+v[22]*s5; 1592 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1593 v[18]*s4+v[23]*s5; 1594 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1595 v[19]*s4+v[24]*s5; 1596 } 1597 1598 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1599 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1600 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1601 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1602 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1603 PetscFunctionReturn(0); 1604 } 1605 1606 #undef __FUNCT__ 1607 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering" 1608 int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1609 { 1610 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1611 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1612 int ierr,*diag = a->diag,jdx; 1613 MatScalar *aa=a->a,*v; 1614 PetscScalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1615 1616 PetscFunctionBegin; 1617 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1618 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1619 /* forward solve the lower triangular */ 1620 idx = 0; 1621 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1622 for (i=1; i<n; i++) { 1623 v = aa + 25*ai[i]; 1624 vi = aj + ai[i]; 1625 nz = diag[i] - ai[i]; 1626 idx = 5*i; 1627 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1628 while (nz--) { 1629 jdx = 5*(*vi++); 1630 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1631 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1632 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1633 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1634 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1635 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1636 v += 25; 1637 } 1638 x[idx] = s1; 1639 x[1+idx] = s2; 1640 x[2+idx] = s3; 1641 x[3+idx] = s4; 1642 x[4+idx] = s5; 1643 } 1644 /* backward solve the upper triangular */ 1645 for (i=n-1; i>=0; i--){ 1646 v = aa + 25*diag[i] + 25; 1647 vi = aj + diag[i] + 1; 1648 nz = ai[i+1] - diag[i] - 1; 1649 idt = 5*i; 1650 s1 = x[idt]; s2 = x[1+idt]; 1651 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1652 while (nz--) { 1653 idx = 5*(*vi++); 1654 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1655 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1656 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1657 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1658 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1659 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1660 v += 25; 1661 } 1662 v = aa + 25*diag[i]; 1663 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1664 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1665 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1666 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1667 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1668 } 1669 1670 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1671 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1672 PetscLogFlops(2*25*(a->nz) - 5*A->n); 1673 PetscFunctionReturn(0); 1674 } 1675 1676 #undef __FUNCT__ 1677 #define __FUNCT__ "MatSolve_SeqBAIJ_4" 1678 int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx) 1679 { 1680 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1681 IS iscol=a->col,isrow=a->row; 1682 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1683 int *diag = a->diag; 1684 MatScalar *aa=a->a,*v; 1685 PetscScalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1686 1687 PetscFunctionBegin; 1688 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1689 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1690 t = a->solve_work; 1691 1692 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1693 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1694 1695 /* forward solve the lower triangular */ 1696 idx = 4*(*r++); 1697 t[0] = b[idx]; t[1] = b[1+idx]; 1698 t[2] = b[2+idx]; t[3] = b[3+idx]; 1699 for (i=1; i<n; i++) { 1700 v = aa + 16*ai[i]; 1701 vi = aj + ai[i]; 1702 nz = diag[i] - ai[i]; 1703 idx = 4*(*r++); 1704 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1705 while (nz--) { 1706 idx = 4*(*vi++); 1707 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1708 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1709 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1710 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1711 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1712 v += 16; 1713 } 1714 idx = 4*i; 1715 t[idx] = s1;t[1+idx] = s2; 1716 t[2+idx] = s3;t[3+idx] = s4; 1717 } 1718 /* backward solve the upper triangular */ 1719 for (i=n-1; i>=0; i--){ 1720 v = aa + 16*diag[i] + 16; 1721 vi = aj + diag[i] + 1; 1722 nz = ai[i+1] - diag[i] - 1; 1723 idt = 4*i; 1724 s1 = t[idt]; s2 = t[1+idt]; 1725 s3 = t[2+idt];s4 = t[3+idt]; 1726 while (nz--) { 1727 idx = 4*(*vi++); 1728 x1 = t[idx]; x2 = t[1+idx]; 1729 x3 = t[2+idx]; x4 = t[3+idx]; 1730 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1731 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1732 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1733 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1734 v += 16; 1735 } 1736 idc = 4*(*c--); 1737 v = aa + 16*diag[i]; 1738 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1739 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1740 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1741 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1742 } 1743 1744 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1745 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1746 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1747 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1748 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1749 PetscFunctionReturn(0); 1750 } 1751 1752 #undef __FUNCT__ 1753 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion" 1754 int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx) 1755 { 1756 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1757 IS iscol=a->col,isrow=a->row; 1758 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1759 int *diag = a->diag; 1760 MatScalar *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1761 PetscScalar *x,*b; 1762 1763 PetscFunctionBegin; 1764 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1765 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1766 t = (MatScalar *)a->solve_work; 1767 1768 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1769 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1770 1771 /* forward solve the lower triangular */ 1772 idx = 4*(*r++); 1773 t[0] = (MatScalar)b[idx]; 1774 t[1] = (MatScalar)b[1+idx]; 1775 t[2] = (MatScalar)b[2+idx]; 1776 t[3] = (MatScalar)b[3+idx]; 1777 for (i=1; i<n; i++) { 1778 v = aa + 16*ai[i]; 1779 vi = aj + ai[i]; 1780 nz = diag[i] - ai[i]; 1781 idx = 4*(*r++); 1782 s1 = (MatScalar)b[idx]; 1783 s2 = (MatScalar)b[1+idx]; 1784 s3 = (MatScalar)b[2+idx]; 1785 s4 = (MatScalar)b[3+idx]; 1786 while (nz--) { 1787 idx = 4*(*vi++); 1788 x1 = t[idx]; 1789 x2 = t[1+idx]; 1790 x3 = t[2+idx]; 1791 x4 = t[3+idx]; 1792 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1793 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1794 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1795 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1796 v += 16; 1797 } 1798 idx = 4*i; 1799 t[idx] = s1; 1800 t[1+idx] = s2; 1801 t[2+idx] = s3; 1802 t[3+idx] = s4; 1803 } 1804 /* backward solve the upper triangular */ 1805 for (i=n-1; i>=0; i--){ 1806 v = aa + 16*diag[i] + 16; 1807 vi = aj + diag[i] + 1; 1808 nz = ai[i+1] - diag[i] - 1; 1809 idt = 4*i; 1810 s1 = t[idt]; 1811 s2 = t[1+idt]; 1812 s3 = t[2+idt]; 1813 s4 = t[3+idt]; 1814 while (nz--) { 1815 idx = 4*(*vi++); 1816 x1 = t[idx]; 1817 x2 = t[1+idx]; 1818 x3 = t[2+idx]; 1819 x4 = t[3+idx]; 1820 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1821 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1822 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1823 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1824 v += 16; 1825 } 1826 idc = 4*(*c--); 1827 v = aa + 16*diag[i]; 1828 t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1829 t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1830 t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1831 t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1832 x[idc] = (PetscScalar)t[idt]; 1833 x[1+idc] = (PetscScalar)t[1+idt]; 1834 x[2+idc] = (PetscScalar)t[2+idt]; 1835 x[3+idc] = (PetscScalar)t[3+idt]; 1836 } 1837 1838 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1839 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1840 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1841 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1842 PetscLogFlops(2*16*(a->nz) - 4*A->n); 1843 PetscFunctionReturn(0); 1844 } 1845 1846 #if defined (PETSC_HAVE_SSE) 1847 1848 #include PETSC_HAVE_SSE 1849 1850 #undef __FUNCT__ 1851 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion" 1852 int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx) 1853 { 1854 /* 1855 Note: This code uses demotion of double 1856 to float when performing the mixed-mode computation. 1857 This may not be numerically reasonable for all applications. 1858 */ 1859 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1860 IS iscol=a->col,isrow=a->row; 1861 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1862 int *diag = a->diag,ai16; 1863 MatScalar *aa=a->a,*v; 1864 PetscScalar *x,*b,*t; 1865 1866 /* Make space in temp stack for 16 Byte Aligned arrays */ 1867 float ssealignedspace[11],*tmps,*tmpx; 1868 unsigned long offset; 1869 1870 PetscFunctionBegin; 1871 SSE_SCOPE_BEGIN; 1872 1873 offset = (unsigned long)ssealignedspace % 16; 1874 if (offset) offset = (16 - offset)/4; 1875 tmps = &ssealignedspace[offset]; 1876 tmpx = &ssealignedspace[offset+4]; 1877 PREFETCH_NTA(aa+16*ai[1]); 1878 1879 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1880 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1881 t = a->solve_work; 1882 1883 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1884 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1885 1886 /* forward solve the lower triangular */ 1887 idx = 4*(*r++); 1888 t[0] = b[idx]; t[1] = b[1+idx]; 1889 t[2] = b[2+idx]; t[3] = b[3+idx]; 1890 v = aa + 16*ai[1]; 1891 1892 for (i=1; i<n;) { 1893 PREFETCH_NTA(&v[8]); 1894 vi = aj + ai[i]; 1895 nz = diag[i] - ai[i]; 1896 idx = 4*(*r++); 1897 1898 /* Demote sum from double to float */ 1899 CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]); 1900 LOAD_PS(tmps,XMM7); 1901 1902 while (nz--) { 1903 PREFETCH_NTA(&v[16]); 1904 idx = 4*(*vi++); 1905 1906 /* Demote solution (so far) from double to float */ 1907 CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]); 1908 1909 /* 4x4 Matrix-Vector product with negative accumulation: */ 1910 SSE_INLINE_BEGIN_2(tmpx,v) 1911 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1912 1913 /* First Column */ 1914 SSE_COPY_PS(XMM0,XMM6) 1915 SSE_SHUFFLE(XMM0,XMM0,0x00) 1916 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1917 SSE_SUB_PS(XMM7,XMM0) 1918 1919 /* Second Column */ 1920 SSE_COPY_PS(XMM1,XMM6) 1921 SSE_SHUFFLE(XMM1,XMM1,0x55) 1922 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1923 SSE_SUB_PS(XMM7,XMM1) 1924 1925 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1926 1927 /* Third Column */ 1928 SSE_COPY_PS(XMM2,XMM6) 1929 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1930 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1931 SSE_SUB_PS(XMM7,XMM2) 1932 1933 /* Fourth Column */ 1934 SSE_COPY_PS(XMM3,XMM6) 1935 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1936 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1937 SSE_SUB_PS(XMM7,XMM3) 1938 SSE_INLINE_END_2 1939 1940 v += 16; 1941 } 1942 idx = 4*i; 1943 v = aa + 16*ai[++i]; 1944 PREFETCH_NTA(v); 1945 STORE_PS(tmps,XMM7); 1946 1947 /* Promote result from float to double */ 1948 CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps); 1949 } 1950 /* backward solve the upper triangular */ 1951 idt = 4*(n-1); 1952 ai16 = 16*diag[n-1]; 1953 v = aa + ai16 + 16; 1954 for (i=n-1; i>=0;){ 1955 PREFETCH_NTA(&v[8]); 1956 vi = aj + diag[i] + 1; 1957 nz = ai[i+1] - diag[i] - 1; 1958 1959 /* Demote accumulator from double to float */ 1960 CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]); 1961 LOAD_PS(tmps,XMM7); 1962 1963 while (nz--) { 1964 PREFETCH_NTA(&v[16]); 1965 idx = 4*(*vi++); 1966 1967 /* Demote solution (so far) from double to float */ 1968 CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]); 1969 1970 /* 4x4 Matrix-Vector Product with negative accumulation: */ 1971 SSE_INLINE_BEGIN_2(tmpx,v) 1972 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 1973 1974 /* First Column */ 1975 SSE_COPY_PS(XMM0,XMM6) 1976 SSE_SHUFFLE(XMM0,XMM0,0x00) 1977 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 1978 SSE_SUB_PS(XMM7,XMM0) 1979 1980 /* Second Column */ 1981 SSE_COPY_PS(XMM1,XMM6) 1982 SSE_SHUFFLE(XMM1,XMM1,0x55) 1983 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 1984 SSE_SUB_PS(XMM7,XMM1) 1985 1986 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 1987 1988 /* Third Column */ 1989 SSE_COPY_PS(XMM2,XMM6) 1990 SSE_SHUFFLE(XMM2,XMM2,0xAA) 1991 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 1992 SSE_SUB_PS(XMM7,XMM2) 1993 1994 /* Fourth Column */ 1995 SSE_COPY_PS(XMM3,XMM6) 1996 SSE_SHUFFLE(XMM3,XMM3,0xFF) 1997 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 1998 SSE_SUB_PS(XMM7,XMM3) 1999 SSE_INLINE_END_2 2000 v += 16; 2001 } 2002 v = aa + ai16; 2003 ai16 = 16*diag[--i]; 2004 PREFETCH_NTA(aa+ai16+16); 2005 /* 2006 Scale the result by the diagonal 4x4 block, 2007 which was inverted as part of the factorization 2008 */ 2009 SSE_INLINE_BEGIN_3(v,tmps,aa+ai16) 2010 /* First Column */ 2011 SSE_COPY_PS(XMM0,XMM7) 2012 SSE_SHUFFLE(XMM0,XMM0,0x00) 2013 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2014 2015 /* Second Column */ 2016 SSE_COPY_PS(XMM1,XMM7) 2017 SSE_SHUFFLE(XMM1,XMM1,0x55) 2018 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2019 SSE_ADD_PS(XMM0,XMM1) 2020 2021 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2022 2023 /* Third Column */ 2024 SSE_COPY_PS(XMM2,XMM7) 2025 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2026 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2027 SSE_ADD_PS(XMM0,XMM2) 2028 2029 /* Fourth Column */ 2030 SSE_COPY_PS(XMM3,XMM7) 2031 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2032 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2033 SSE_ADD_PS(XMM0,XMM3) 2034 2035 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2036 SSE_INLINE_END_3 2037 2038 /* Promote solution from float to double */ 2039 CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps); 2040 2041 /* Apply reordering to t and stream into x. */ 2042 /* This way, x doesn't pollute the cache. */ 2043 /* Be careful with size: 2 doubles = 4 floats! */ 2044 idc = 4*(*c--); 2045 SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc]) 2046 /* x[idc] = t[idt]; x[1+idc] = t[1+idc]; */ 2047 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0) 2048 SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0) 2049 /* x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */ 2050 SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1) 2051 SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1) 2052 SSE_INLINE_END_2 2053 v = aa + ai16 + 16; 2054 idt -= 4; 2055 } 2056 2057 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2058 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2059 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2060 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2061 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2062 SSE_SCOPE_END; 2063 PetscFunctionReturn(0); 2064 } 2065 2066 #endif 2067 2068 2069 /* 2070 Special case where the matrix was ILU(0) factored in the natural 2071 ordering. This eliminates the need for the column and row permutation. 2072 */ 2073 #undef __FUNCT__ 2074 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering" 2075 int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 2076 { 2077 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2078 int n=a->mbs,*ai=a->i,*aj=a->j; 2079 int ierr,*diag = a->diag; 2080 MatScalar *aa=a->a; 2081 PetscScalar *x,*b; 2082 2083 PetscFunctionBegin; 2084 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2085 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2086 2087 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 2088 { 2089 static PetscScalar w[2000]; /* very BAD need to fix */ 2090 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 2091 } 2092 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 2093 { 2094 static PetscScalar w[2000]; /* very BAD need to fix */ 2095 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 2096 } 2097 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 2098 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 2099 #else 2100 { 2101 PetscScalar s1,s2,s3,s4,x1,x2,x3,x4; 2102 MatScalar *v; 2103 int jdx,idt,idx,nz,*vi,i,ai16; 2104 2105 /* forward solve the lower triangular */ 2106 idx = 0; 2107 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 2108 for (i=1; i<n; i++) { 2109 v = aa + 16*ai[i]; 2110 vi = aj + ai[i]; 2111 nz = diag[i] - ai[i]; 2112 idx += 4; 2113 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 2114 while (nz--) { 2115 jdx = 4*(*vi++); 2116 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 2117 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2118 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2119 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2120 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2121 v += 16; 2122 } 2123 x[idx] = s1; 2124 x[1+idx] = s2; 2125 x[2+idx] = s3; 2126 x[3+idx] = s4; 2127 } 2128 /* backward solve the upper triangular */ 2129 idt = 4*(n-1); 2130 for (i=n-1; i>=0; i--){ 2131 ai16 = 16*diag[i]; 2132 v = aa + ai16 + 16; 2133 vi = aj + diag[i] + 1; 2134 nz = ai[i+1] - diag[i] - 1; 2135 s1 = x[idt]; s2 = x[1+idt]; 2136 s3 = x[2+idt];s4 = x[3+idt]; 2137 while (nz--) { 2138 idx = 4*(*vi++); 2139 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 2140 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2141 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2142 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2143 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2144 v += 16; 2145 } 2146 v = aa + ai16; 2147 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 2148 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 2149 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 2150 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 2151 idt -= 4; 2152 } 2153 } 2154 #endif 2155 2156 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2157 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2158 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2159 PetscFunctionReturn(0); 2160 } 2161 2162 #undef __FUNCT__ 2163 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion" 2164 int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx) 2165 { 2166 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2167 int n=a->mbs,*ai=a->i,*aj=a->j; 2168 int ierr,*diag = a->diag; 2169 MatScalar *aa=a->a; 2170 PetscScalar *x,*b; 2171 2172 PetscFunctionBegin; 2173 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2174 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2175 2176 { 2177 MatScalar s1,s2,s3,s4,x1,x2,x3,x4; 2178 MatScalar *v,*t=(MatScalar *)x; 2179 int jdx,idt,idx,nz,*vi,i,ai16; 2180 2181 /* forward solve the lower triangular */ 2182 idx = 0; 2183 t[0] = (MatScalar)b[0]; 2184 t[1] = (MatScalar)b[1]; 2185 t[2] = (MatScalar)b[2]; 2186 t[3] = (MatScalar)b[3]; 2187 for (i=1; i<n; i++) { 2188 v = aa + 16*ai[i]; 2189 vi = aj + ai[i]; 2190 nz = diag[i] - ai[i]; 2191 idx += 4; 2192 s1 = (MatScalar)b[idx]; 2193 s2 = (MatScalar)b[1+idx]; 2194 s3 = (MatScalar)b[2+idx]; 2195 s4 = (MatScalar)b[3+idx]; 2196 while (nz--) { 2197 jdx = 4*(*vi++); 2198 x1 = t[jdx]; 2199 x2 = t[1+jdx]; 2200 x3 = t[2+jdx]; 2201 x4 = t[3+jdx]; 2202 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2203 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2204 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2205 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2206 v += 16; 2207 } 2208 t[idx] = s1; 2209 t[1+idx] = s2; 2210 t[2+idx] = s3; 2211 t[3+idx] = s4; 2212 } 2213 /* backward solve the upper triangular */ 2214 idt = 4*(n-1); 2215 for (i=n-1; i>=0; i--){ 2216 ai16 = 16*diag[i]; 2217 v = aa + ai16 + 16; 2218 vi = aj + diag[i] + 1; 2219 nz = ai[i+1] - diag[i] - 1; 2220 s1 = t[idt]; 2221 s2 = t[1+idt]; 2222 s3 = t[2+idt]; 2223 s4 = t[3+idt]; 2224 while (nz--) { 2225 idx = 4*(*vi++); 2226 x1 = (MatScalar)x[idx]; 2227 x2 = (MatScalar)x[1+idx]; 2228 x3 = (MatScalar)x[2+idx]; 2229 x4 = (MatScalar)x[3+idx]; 2230 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 2231 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 2232 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 2233 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 2234 v += 16; 2235 } 2236 v = aa + ai16; 2237 x[idt] = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4); 2238 x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4); 2239 x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4); 2240 x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4); 2241 idt -= 4; 2242 } 2243 } 2244 2245 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2246 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2247 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2248 PetscFunctionReturn(0); 2249 } 2250 2251 #if defined (PETSC_HAVE_SSE) 2252 2253 #include PETSC_HAVE_SSE 2254 #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */ 2255 #undef __FUNCT__ 2256 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion" 2257 int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx) 2258 { 2259 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2260 int n=a->mbs,*ai=a->i,*aj=a->j; 2261 int ierr,*diag = a->diag; 2262 MatScalar *aa=a->a; 2263 PetscScalar *x,*b; 2264 2265 PetscFunctionBegin; 2266 SSE_SCOPE_BEGIN; 2267 /* 2268 Note: This code currently uses demotion of double 2269 to float when performing the mixed-mode computation. 2270 This may not be numerically reasonable for all applications. 2271 */ 2272 PREFETCH_NTA(aa+16*ai[1]); 2273 2274 ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr); 2275 ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr); 2276 { 2277 /* x will first be computed in single precision then promoted inplace to double */ 2278 MatScalar *v,*t=(MatScalar *)x; 2279 int jdx,idt,idx,nz,*vi,i,ai16; 2280 2281 /* Forward solve the lower triangular factor. */ 2282 2283 /* First block is the identity. */ 2284 idx = 0; 2285 CONVERT_DOUBLE4_FLOAT4(t,b); 2286 v = aa + 16*ai[1]; 2287 2288 for (i=1; i<n;) { 2289 PREFETCH_NTA(&v[8]); 2290 vi = aj + ai[i]; 2291 nz = diag[i] - ai[i]; 2292 idx += 4; 2293 2294 /* Demote RHS from double to float. */ 2295 CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]); 2296 LOAD_PS(&t[idx],XMM7); 2297 2298 while (nz--) { 2299 PREFETCH_NTA(&v[16]); 2300 jdx = 4*(*vi++); 2301 2302 /* 4x4 Matrix-Vector product with negative accumulation: */ 2303 SSE_INLINE_BEGIN_2(&t[jdx],v) 2304 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2305 2306 /* First Column */ 2307 SSE_COPY_PS(XMM0,XMM6) 2308 SSE_SHUFFLE(XMM0,XMM0,0x00) 2309 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2310 SSE_SUB_PS(XMM7,XMM0) 2311 2312 /* Second Column */ 2313 SSE_COPY_PS(XMM1,XMM6) 2314 SSE_SHUFFLE(XMM1,XMM1,0x55) 2315 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2316 SSE_SUB_PS(XMM7,XMM1) 2317 2318 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2319 2320 /* Third Column */ 2321 SSE_COPY_PS(XMM2,XMM6) 2322 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2323 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2324 SSE_SUB_PS(XMM7,XMM2) 2325 2326 /* Fourth Column */ 2327 SSE_COPY_PS(XMM3,XMM6) 2328 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2329 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2330 SSE_SUB_PS(XMM7,XMM3) 2331 SSE_INLINE_END_2 2332 2333 v += 16; 2334 } 2335 v = aa + 16*ai[++i]; 2336 PREFETCH_NTA(v); 2337 STORE_PS(&t[idx],XMM7); 2338 } 2339 2340 /* Backward solve the upper triangular factor.*/ 2341 2342 idt = 4*(n-1); 2343 ai16 = 16*diag[n-1]; 2344 v = aa + ai16 + 16; 2345 for (i=n-1; i>=0;){ 2346 PREFETCH_NTA(&v[8]); 2347 vi = aj + diag[i] + 1; 2348 nz = ai[i+1] - diag[i] - 1; 2349 2350 LOAD_PS(&t[idt],XMM7); 2351 2352 while (nz--) { 2353 PREFETCH_NTA(&v[16]); 2354 idx = 4*(*vi++); 2355 2356 /* 4x4 Matrix-Vector Product with negative accumulation: */ 2357 SSE_INLINE_BEGIN_2(&t[idx],v) 2358 SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6) 2359 2360 /* First Column */ 2361 SSE_COPY_PS(XMM0,XMM6) 2362 SSE_SHUFFLE(XMM0,XMM0,0x00) 2363 SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0) 2364 SSE_SUB_PS(XMM7,XMM0) 2365 2366 /* Second Column */ 2367 SSE_COPY_PS(XMM1,XMM6) 2368 SSE_SHUFFLE(XMM1,XMM1,0x55) 2369 SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4) 2370 SSE_SUB_PS(XMM7,XMM1) 2371 2372 SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24) 2373 2374 /* Third Column */ 2375 SSE_COPY_PS(XMM2,XMM6) 2376 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2377 SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8) 2378 SSE_SUB_PS(XMM7,XMM2) 2379 2380 /* Fourth Column */ 2381 SSE_COPY_PS(XMM3,XMM6) 2382 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2383 SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12) 2384 SSE_SUB_PS(XMM7,XMM3) 2385 SSE_INLINE_END_2 2386 v += 16; 2387 } 2388 v = aa + ai16; 2389 ai16 = 16*diag[--i]; 2390 PREFETCH_NTA(aa+ai16+16); 2391 /* 2392 Scale the result by the diagonal 4x4 block, 2393 which was inverted as part of the factorization 2394 */ 2395 SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16) 2396 /* First Column */ 2397 SSE_COPY_PS(XMM0,XMM7) 2398 SSE_SHUFFLE(XMM0,XMM0,0x00) 2399 SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0) 2400 2401 /* Second Column */ 2402 SSE_COPY_PS(XMM1,XMM7) 2403 SSE_SHUFFLE(XMM1,XMM1,0x55) 2404 SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4) 2405 SSE_ADD_PS(XMM0,XMM1) 2406 2407 SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24) 2408 2409 /* Third Column */ 2410 SSE_COPY_PS(XMM2,XMM7) 2411 SSE_SHUFFLE(XMM2,XMM2,0xAA) 2412 SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8) 2413 SSE_ADD_PS(XMM0,XMM2) 2414 2415 /* Fourth Column */ 2416 SSE_COPY_PS(XMM3,XMM7) 2417 SSE_SHUFFLE(XMM3,XMM3,0xFF) 2418 SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12) 2419 SSE_ADD_PS(XMM0,XMM3) 2420 2421 SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0) 2422 SSE_INLINE_END_3 2423 2424 v = aa + ai16 + 16; 2425 idt -= 4; 2426 } 2427 2428 /* Convert t from single precision back to double precision (inplace)*/ 2429 idt = 4*(n-1); 2430 for (i=n-1;i>=0;i--) { 2431 /* CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */ 2432 /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */ 2433 PetscScalar *xtemp=&x[idt]; 2434 MatScalar *ttemp=&t[idt]; 2435 xtemp[3] = (PetscScalar)ttemp[3]; 2436 xtemp[2] = (PetscScalar)ttemp[2]; 2437 xtemp[1] = (PetscScalar)ttemp[1]; 2438 xtemp[0] = (PetscScalar)ttemp[0]; 2439 } 2440 2441 } /* End of artificial scope. */ 2442 ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr); 2443 ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr); 2444 PetscLogFlops(2*16*(a->nz) - 4*A->n); 2445 SSE_SCOPE_END; 2446 PetscFunctionReturn(0); 2447 } 2448 2449 #endif 2450 #undef __FUNCT__ 2451 #define __FUNCT__ "MatSolve_SeqBAIJ_3" 2452 int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx) 2453 { 2454 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2455 IS iscol=a->col,isrow=a->row; 2456 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2457 int *diag = a->diag; 2458 MatScalar *aa=a->a,*v; 2459 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 2460 2461 PetscFunctionBegin; 2462 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2463 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2464 t = a->solve_work; 2465 2466 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2467 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2468 2469 /* forward solve the lower triangular */ 2470 idx = 3*(*r++); 2471 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 2472 for (i=1; i<n; i++) { 2473 v = aa + 9*ai[i]; 2474 vi = aj + ai[i]; 2475 nz = diag[i] - ai[i]; 2476 idx = 3*(*r++); 2477 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 2478 while (nz--) { 2479 idx = 3*(*vi++); 2480 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2481 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2482 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2483 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2484 v += 9; 2485 } 2486 idx = 3*i; 2487 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 2488 } 2489 /* backward solve the upper triangular */ 2490 for (i=n-1; i>=0; i--){ 2491 v = aa + 9*diag[i] + 9; 2492 vi = aj + diag[i] + 1; 2493 nz = ai[i+1] - diag[i] - 1; 2494 idt = 3*i; 2495 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 2496 while (nz--) { 2497 idx = 3*(*vi++); 2498 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 2499 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2500 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2501 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2502 v += 9; 2503 } 2504 idc = 3*(*c--); 2505 v = aa + 9*diag[i]; 2506 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2507 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2508 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2509 } 2510 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2511 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2512 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2513 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2514 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2515 PetscFunctionReturn(0); 2516 } 2517 2518 /* 2519 Special case where the matrix was ILU(0) factored in the natural 2520 ordering. This eliminates the need for the column and row permutation. 2521 */ 2522 #undef __FUNCT__ 2523 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering" 2524 int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 2525 { 2526 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2527 int n=a->mbs,*ai=a->i,*aj=a->j; 2528 int ierr,*diag = a->diag; 2529 MatScalar *aa=a->a,*v; 2530 PetscScalar *x,*b,s1,s2,s3,x1,x2,x3; 2531 int jdx,idt,idx,nz,*vi,i; 2532 2533 PetscFunctionBegin; 2534 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2535 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2536 2537 2538 /* forward solve the lower triangular */ 2539 idx = 0; 2540 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 2541 for (i=1; i<n; i++) { 2542 v = aa + 9*ai[i]; 2543 vi = aj + ai[i]; 2544 nz = diag[i] - ai[i]; 2545 idx += 3; 2546 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 2547 while (nz--) { 2548 jdx = 3*(*vi++); 2549 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 2550 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2551 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2552 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2553 v += 9; 2554 } 2555 x[idx] = s1; 2556 x[1+idx] = s2; 2557 x[2+idx] = s3; 2558 } 2559 /* backward solve the upper triangular */ 2560 for (i=n-1; i>=0; i--){ 2561 v = aa + 9*diag[i] + 9; 2562 vi = aj + diag[i] + 1; 2563 nz = ai[i+1] - diag[i] - 1; 2564 idt = 3*i; 2565 s1 = x[idt]; s2 = x[1+idt]; 2566 s3 = x[2+idt]; 2567 while (nz--) { 2568 idx = 3*(*vi++); 2569 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 2570 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 2571 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 2572 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 2573 v += 9; 2574 } 2575 v = aa + 9*diag[i]; 2576 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 2577 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 2578 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 2579 } 2580 2581 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2582 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2583 PetscLogFlops(2*9*(a->nz) - 3*A->n); 2584 PetscFunctionReturn(0); 2585 } 2586 2587 #undef __FUNCT__ 2588 #define __FUNCT__ "MatSolve_SeqBAIJ_2" 2589 int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx) 2590 { 2591 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2592 IS iscol=a->col,isrow=a->row; 2593 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 2594 int *diag = a->diag; 2595 MatScalar *aa=a->a,*v; 2596 PetscScalar *x,*b,s1,s2,x1,x2,*t; 2597 2598 PetscFunctionBegin; 2599 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2600 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2601 t = a->solve_work; 2602 2603 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2604 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2605 2606 /* forward solve the lower triangular */ 2607 idx = 2*(*r++); 2608 t[0] = b[idx]; t[1] = b[1+idx]; 2609 for (i=1; i<n; i++) { 2610 v = aa + 4*ai[i]; 2611 vi = aj + ai[i]; 2612 nz = diag[i] - ai[i]; 2613 idx = 2*(*r++); 2614 s1 = b[idx]; s2 = b[1+idx]; 2615 while (nz--) { 2616 idx = 2*(*vi++); 2617 x1 = t[idx]; x2 = t[1+idx]; 2618 s1 -= v[0]*x1 + v[2]*x2; 2619 s2 -= v[1]*x1 + v[3]*x2; 2620 v += 4; 2621 } 2622 idx = 2*i; 2623 t[idx] = s1; t[1+idx] = s2; 2624 } 2625 /* backward solve the upper triangular */ 2626 for (i=n-1; i>=0; i--){ 2627 v = aa + 4*diag[i] + 4; 2628 vi = aj + diag[i] + 1; 2629 nz = ai[i+1] - diag[i] - 1; 2630 idt = 2*i; 2631 s1 = t[idt]; s2 = t[1+idt]; 2632 while (nz--) { 2633 idx = 2*(*vi++); 2634 x1 = t[idx]; x2 = t[1+idx]; 2635 s1 -= v[0]*x1 + v[2]*x2; 2636 s2 -= v[1]*x1 + v[3]*x2; 2637 v += 4; 2638 } 2639 idc = 2*(*c--); 2640 v = aa + 4*diag[i]; 2641 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2642 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2643 } 2644 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2645 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2646 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2647 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2648 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2649 PetscFunctionReturn(0); 2650 } 2651 2652 /* 2653 Special case where the matrix was ILU(0) factored in the natural 2654 ordering. This eliminates the need for the column and row permutation. 2655 */ 2656 #undef __FUNCT__ 2657 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering" 2658 int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2659 { 2660 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2661 int n=a->mbs,*ai=a->i,*aj=a->j; 2662 int ierr,*diag = a->diag; 2663 MatScalar *aa=a->a,*v; 2664 PetscScalar *x,*b,s1,s2,x1,x2; 2665 int jdx,idt,idx,nz,*vi,i; 2666 2667 PetscFunctionBegin; 2668 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2669 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2670 2671 /* forward solve the lower triangular */ 2672 idx = 0; 2673 x[0] = b[0]; x[1] = b[1]; 2674 for (i=1; i<n; i++) { 2675 v = aa + 4*ai[i]; 2676 vi = aj + ai[i]; 2677 nz = diag[i] - ai[i]; 2678 idx += 2; 2679 s1 = b[idx];s2 = b[1+idx]; 2680 while (nz--) { 2681 jdx = 2*(*vi++); 2682 x1 = x[jdx];x2 = x[1+jdx]; 2683 s1 -= v[0]*x1 + v[2]*x2; 2684 s2 -= v[1]*x1 + v[3]*x2; 2685 v += 4; 2686 } 2687 x[idx] = s1; 2688 x[1+idx] = s2; 2689 } 2690 /* backward solve the upper triangular */ 2691 for (i=n-1; i>=0; i--){ 2692 v = aa + 4*diag[i] + 4; 2693 vi = aj + diag[i] + 1; 2694 nz = ai[i+1] - diag[i] - 1; 2695 idt = 2*i; 2696 s1 = x[idt]; s2 = x[1+idt]; 2697 while (nz--) { 2698 idx = 2*(*vi++); 2699 x1 = x[idx]; x2 = x[1+idx]; 2700 s1 -= v[0]*x1 + v[2]*x2; 2701 s2 -= v[1]*x1 + v[3]*x2; 2702 v += 4; 2703 } 2704 v = aa + 4*diag[i]; 2705 x[idt] = v[0]*s1 + v[2]*s2; 2706 x[1+idt] = v[1]*s1 + v[3]*s2; 2707 } 2708 2709 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2710 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2711 PetscLogFlops(2*4*(a->nz) - 2*A->n); 2712 PetscFunctionReturn(0); 2713 } 2714 2715 #undef __FUNCT__ 2716 #define __FUNCT__ "MatSolve_SeqBAIJ_1" 2717 int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx) 2718 { 2719 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2720 IS iscol=a->col,isrow=a->row; 2721 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2722 int *diag = a->diag; 2723 MatScalar *aa=a->a,*v; 2724 PetscScalar *x,*b,s1,*t; 2725 2726 PetscFunctionBegin; 2727 if (!n) PetscFunctionReturn(0); 2728 2729 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2730 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2731 t = a->solve_work; 2732 2733 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2734 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2735 2736 /* forward solve the lower triangular */ 2737 t[0] = b[*r++]; 2738 for (i=1; i<n; i++) { 2739 v = aa + ai[i]; 2740 vi = aj + ai[i]; 2741 nz = diag[i] - ai[i]; 2742 s1 = b[*r++]; 2743 while (nz--) { 2744 s1 -= (*v++)*t[*vi++]; 2745 } 2746 t[i] = s1; 2747 } 2748 /* backward solve the upper triangular */ 2749 for (i=n-1; i>=0; i--){ 2750 v = aa + diag[i] + 1; 2751 vi = aj + diag[i] + 1; 2752 nz = ai[i+1] - diag[i] - 1; 2753 s1 = t[i]; 2754 while (nz--) { 2755 s1 -= (*v++)*t[*vi++]; 2756 } 2757 x[*c--] = t[i] = aa[diag[i]]*s1; 2758 } 2759 2760 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2761 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2762 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2763 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2764 PetscLogFlops(2*1*(a->nz) - A->n); 2765 PetscFunctionReturn(0); 2766 } 2767 /* 2768 Special case where the matrix was ILU(0) factored in the natural 2769 ordering. This eliminates the need for the column and row permutation. 2770 */ 2771 #undef __FUNCT__ 2772 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering" 2773 int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2774 { 2775 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2776 int n=a->mbs,*ai=a->i,*aj=a->j; 2777 int ierr,*diag = a->diag; 2778 MatScalar *aa=a->a; 2779 PetscScalar *x,*b; 2780 PetscScalar s1,x1; 2781 MatScalar *v; 2782 int jdx,idt,idx,nz,*vi,i; 2783 2784 PetscFunctionBegin; 2785 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2786 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2787 2788 /* forward solve the lower triangular */ 2789 idx = 0; 2790 x[0] = b[0]; 2791 for (i=1; i<n; i++) { 2792 v = aa + ai[i]; 2793 vi = aj + ai[i]; 2794 nz = diag[i] - ai[i]; 2795 idx += 1; 2796 s1 = b[idx]; 2797 while (nz--) { 2798 jdx = *vi++; 2799 x1 = x[jdx]; 2800 s1 -= v[0]*x1; 2801 v += 1; 2802 } 2803 x[idx] = s1; 2804 } 2805 /* backward solve the upper triangular */ 2806 for (i=n-1; i>=0; i--){ 2807 v = aa + diag[i] + 1; 2808 vi = aj + diag[i] + 1; 2809 nz = ai[i+1] - diag[i] - 1; 2810 idt = i; 2811 s1 = x[idt]; 2812 while (nz--) { 2813 idx = *vi++; 2814 x1 = x[idx]; 2815 s1 -= v[0]*x1; 2816 v += 1; 2817 } 2818 v = aa + diag[i]; 2819 x[idt] = v[0]*s1; 2820 } 2821 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2822 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2823 PetscLogFlops(2*(a->nz) - A->n); 2824 PetscFunctionReturn(0); 2825 } 2826 2827 /* ----------------------------------------------------------------*/ 2828 /* 2829 This code is virtually identical to MatILUFactorSymbolic_SeqAIJ 2830 except that the data structure of Mat_SeqAIJ is slightly different. 2831 Not a good example of code reuse. 2832 */ 2833 EXTERN int MatMissingDiagonal_SeqBAIJ(Mat); 2834 2835 #undef __FUNCT__ 2836 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ" 2837 int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact) 2838 { 2839 Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b; 2840 IS isicol; 2841 int *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j; 2842 int *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev; 2843 int *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0; 2844 int incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill; 2845 PetscTruth col_identity,row_identity; 2846 PetscReal f; 2847 2848 PetscFunctionBegin; 2849 f = info->fill; 2850 levels = (int)info->levels; 2851 diagonal_fill = (int)info->diagonal_fill; 2852 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2853 ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr); 2854 ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr); 2855 2856 if (!levels && row_identity && col_identity) { /* special case copy the nonzero structure */ 2857 ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr); 2858 (*fact)->factor = FACTOR_LU; 2859 b = (Mat_SeqBAIJ*)(*fact)->data; 2860 if (!b->diag) { 2861 ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2862 } 2863 ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr); 2864 b->row = isrow; 2865 b->col = iscol; 2866 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2867 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2868 b->icol = isicol; 2869 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 2870 ierr = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 2871 } else { /* general case perform the symbolic factorization */ 2872 ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr); 2873 ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr); 2874 2875 /* get new row pointers */ 2876 ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr); 2877 ainew[0] = 0; 2878 /* don't know how many column pointers are needed so estimate */ 2879 jmax = (int)(f*ai[n] + 1); 2880 ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr); 2881 /* ajfill is level of fill for each fill entry */ 2882 ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr); 2883 /* fill is a linked list of nonzeros in active row */ 2884 ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr); 2885 /* im is level for each filled value */ 2886 ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr); 2887 /* dloc is location of diagonal in factor */ 2888 ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr); 2889 dloc[0] = 0; 2890 for (prow=0; prow<n; prow++) { 2891 2892 /* copy prow into linked list */ 2893 nzf = nz = ai[r[prow]+1] - ai[r[prow]]; 2894 if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix"); 2895 xi = aj + ai[r[prow]]; 2896 fill[n] = n; 2897 fill[prow] = -1; /* marker for diagonal entry */ 2898 while (nz--) { 2899 fm = n; 2900 idx = ic[*xi++]; 2901 do { 2902 m = fm; 2903 fm = fill[m]; 2904 } while (fm < idx); 2905 fill[m] = idx; 2906 fill[idx] = fm; 2907 im[idx] = 0; 2908 } 2909 2910 /* make sure diagonal entry is included */ 2911 if (diagonal_fill && fill[prow] == -1) { 2912 fm = n; 2913 while (fill[fm] < prow) fm = fill[fm]; 2914 fill[prow] = fill[fm]; /* insert diagonal into linked list */ 2915 fill[fm] = prow; 2916 im[prow] = 0; 2917 nzf++; 2918 dcount++; 2919 } 2920 2921 nzi = 0; 2922 row = fill[n]; 2923 while (row < prow) { 2924 incrlev = im[row] + 1; 2925 nz = dloc[row]; 2926 xi = ajnew + ainew[row] + nz + 1; 2927 flev = ajfill + ainew[row] + nz + 1; 2928 nnz = ainew[row+1] - ainew[row] - nz - 1; 2929 fm = row; 2930 while (nnz-- > 0) { 2931 idx = *xi++; 2932 if (*flev + incrlev > levels) { 2933 flev++; 2934 continue; 2935 } 2936 do { 2937 m = fm; 2938 fm = fill[m]; 2939 } while (fm < idx); 2940 if (fm != idx) { 2941 im[idx] = *flev + incrlev; 2942 fill[m] = idx; 2943 fill[idx] = fm; 2944 fm = idx; 2945 nzf++; 2946 } else { 2947 if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev; 2948 } 2949 flev++; 2950 } 2951 row = fill[row]; 2952 nzi++; 2953 } 2954 /* copy new filled row into permanent storage */ 2955 ainew[prow+1] = ainew[prow] + nzf; 2956 if (ainew[prow+1] > jmax) { 2957 2958 /* estimate how much additional space we will need */ 2959 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2960 /* just double the memory each time */ 2961 int maxadd = jmax; 2962 /* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */ 2963 if (maxadd < nzf) maxadd = (n-prow)*(nzf+1); 2964 jmax += maxadd; 2965 2966 /* allocate a longer ajnew and ajfill */ 2967 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2968 ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2969 ierr = PetscFree(ajnew);CHKERRQ(ierr); 2970 ajnew = xi; 2971 ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr); 2972 ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr); 2973 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2974 ajfill = xi; 2975 reallocate++; /* count how many reallocations are needed */ 2976 } 2977 xi = ajnew + ainew[prow]; 2978 flev = ajfill + ainew[prow]; 2979 dloc[prow] = nzi; 2980 fm = fill[n]; 2981 while (nzf--) { 2982 *xi++ = fm; 2983 *flev++ = im[fm]; 2984 fm = fill[fm]; 2985 } 2986 /* make sure row has diagonal entry */ 2987 if (ajnew[ainew[prow]+dloc[prow]] != prow) { 2988 SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\ 2989 try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow); 2990 } 2991 } 2992 ierr = PetscFree(ajfill);CHKERRQ(ierr); 2993 ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr); 2994 ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr); 2995 ierr = PetscFree(fill);CHKERRQ(ierr); 2996 ierr = PetscFree(im);CHKERRQ(ierr); 2997 2998 { 2999 PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]); 3000 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af); 3001 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af); 3002 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af); 3003 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n"); 3004 if (diagonal_fill) { 3005 PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount); 3006 } 3007 } 3008 3009 /* put together the new matrix */ 3010 ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr); 3011 PetscLogObjectParent(*fact,isicol); 3012 b = (Mat_SeqBAIJ*)(*fact)->data; 3013 ierr = PetscFree(b->imax);CHKERRQ(ierr); 3014 b->singlemalloc = PETSC_FALSE; 3015 len = bs2*ainew[n]*sizeof(MatScalar); 3016 /* the next line frees the default space generated by the Create() */ 3017 ierr = PetscFree(b->a);CHKERRQ(ierr); 3018 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 3019 ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr); 3020 b->j = ajnew; 3021 b->i = ainew; 3022 for (i=0; i<n; i++) dloc[i] += ainew[i]; 3023 b->diag = dloc; 3024 b->ilen = 0; 3025 b->imax = 0; 3026 b->row = isrow; 3027 b->col = iscol; 3028 b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE; 3029 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 3030 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 3031 b->icol = isicol; 3032 ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr); 3033 /* In b structure: Free imax, ilen, old a, old j. 3034 Allocate dloc, solve_work, new a, new j */ 3035 PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar)); 3036 b->maxnz = b->nz = ainew[n]; 3037 (*fact)->factor = FACTOR_LU; 3038 3039 (*fact)->info.factor_mallocs = reallocate; 3040 (*fact)->info.fill_ratio_given = f; 3041 (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]); 3042 } 3043 3044 if (row_identity && col_identity) { 3045 ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr); 3046 } 3047 PetscFunctionReturn(0); 3048 } 3049 3050 #undef __FUNCT__ 3051 #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering" 3052 int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA) 3053 { 3054 /* 3055 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3056 with natural ordering 3057 */ 3058 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data; 3059 int ierr; 3060 3061 PetscFunctionBegin; 3062 inA->ops->solve = MatSolve_SeqBAIJ_Update; 3063 inA->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_Update; 3064 switch (a->bs) { 3065 case 1: 3066 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1; 3067 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n"); 3068 break; 3069 case 2: 3070 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering; 3071 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n"); 3072 break; 3073 case 3: 3074 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering; 3075 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n"); 3076 break; 3077 case 4: 3078 #if defined(PETSC_USE_MAT_SINGLE) 3079 { 3080 PetscTruth sse_enabled_local; 3081 ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3082 if (sse_enabled_local) { 3083 # if defined(PETSC_HAVE_SSE) 3084 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE; 3085 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n"); 3086 # else 3087 /* This should never be reached. If so, problem in PetscSSEIsEnabled. */ 3088 SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable"); 3089 # endif 3090 } else { 3091 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3092 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3093 } 3094 } 3095 #else 3096 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering; 3097 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n"); 3098 #endif 3099 break; 3100 case 5: 3101 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering; 3102 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n"); 3103 break; 3104 case 6: 3105 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering; 3106 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n"); 3107 break; 3108 case 7: 3109 inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering; 3110 PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n"); 3111 break; 3112 } 3113 PetscFunctionReturn(0); 3114 } 3115 3116 #undef __FUNCT__ 3117 #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers" 3118 int MatSeqBAIJ_UpdateSolvers(Mat A) 3119 { 3120 /* 3121 Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver 3122 with natural ordering 3123 */ 3124 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 3125 IS row = a->row, col = a->col; 3126 PetscTruth row_identity, col_identity; 3127 PetscTruth use_natural; 3128 int ierr; 3129 3130 PetscFunctionBegin; 3131 3132 use_natural = PETSC_FALSE; 3133 3134 ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr); 3135 ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr); 3136 3137 if (row_identity && col_identity) { 3138 use_natural = PETSC_TRUE; 3139 } else { 3140 use_natural = PETSC_FALSE; 3141 } 3142 switch (a->bs) { 3143 case 1: 3144 if (use_natural) { 3145 A->ops->solve = MatSolve_SeqBAIJ_1_NaturalOrdering; 3146 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering; 3147 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n"); 3148 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3149 } else { 3150 A->ops->solve = MatSolve_SeqBAIJ_1; 3151 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_1; 3152 } 3153 break; 3154 case 2: 3155 if (use_natural) { 3156 A->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering; 3157 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering; 3158 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n"); 3159 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3160 } else { 3161 A->ops->solve = MatSolve_SeqBAIJ_2; 3162 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_2; 3163 } 3164 break; 3165 case 3: 3166 if (use_natural) { 3167 A->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering; 3168 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering; 3169 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n"); 3170 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n"); 3171 } else { 3172 A->ops->solve = MatSolve_SeqBAIJ_3; 3173 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_3; 3174 } 3175 break; 3176 case 4: 3177 { 3178 PetscTruth sse_enabled_local; 3179 ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr); 3180 if (use_natural) { 3181 #if defined(PETSC_USE_MAT_SINGLE) 3182 if (sse_enabled_local) { /* Natural + Single + SSE */ 3183 # if defined(PETSC_HAVE_SSE) 3184 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion; 3185 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, natural ordering solve BS=4\n"); 3186 # else 3187 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3188 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3189 # endif 3190 } else { /* Natural + Single */ 3191 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion; 3192 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n"); 3193 } 3194 #else 3195 A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering; 3196 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3197 #endif 3198 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering; 3199 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n"); 3200 } else { /* Arbitrary ordering */ 3201 #if defined(PETSC_USE_MAT_SINGLE) 3202 if (sse_enabled_local) { /* Arbitrary + Single + SSE */ 3203 # if defined(PETSC_HAVE_SSE) 3204 A->ops->solve = MatSolve_SeqBAIJ_4_SSE_Demotion; 3205 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n"); 3206 # else 3207 /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */ 3208 SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable."); 3209 # endif 3210 } else { /* Arbitrary + Single */ 3211 A->ops->solve = MatSolve_SeqBAIJ_4_Demotion; 3212 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n"); 3213 } 3214 #else 3215 A->ops->solve = MatSolve_SeqBAIJ_4; 3216 #endif 3217 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_4; 3218 } 3219 } 3220 break; 3221 case 5: 3222 if (use_natural) { 3223 A->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering; 3224 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering; 3225 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3226 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n"); 3227 } else { 3228 A->ops->solve = MatSolve_SeqBAIJ_5; 3229 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_5; 3230 } 3231 break; 3232 case 6: 3233 if (use_natural) { 3234 A->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering; 3235 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering; 3236 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3237 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n"); 3238 } else { 3239 A->ops->solve = MatSolve_SeqBAIJ_6; 3240 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_6; 3241 } 3242 break; 3243 case 7: 3244 if (use_natural) { 3245 A->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering; 3246 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering; 3247 PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3248 PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n"); 3249 } else { 3250 A->ops->solve = MatSolve_SeqBAIJ_7; 3251 A->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_7; 3252 } 3253 break; 3254 default: 3255 A->ops->solve = MatSolve_SeqBAIJ_N; 3256 break; 3257 } 3258 PetscFunctionReturn(0); 3259 } 3260 3261 #undef __FUNCT__ 3262 #define __FUNCT__ "MatSolve_SeqBAIJ_Update" 3263 int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3264 int ierr; 3265 3266 PetscFunctionBegin; 3267 ierr = MatSeqBAIJ_UpdateSolvers(A); 3268 if (A->ops->solve != MatSolve_SeqBAIJ_Update) { 3269 ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr); 3270 } else { 3271 SETERRQ(PETSC_ERR_SUP,"Something really wrong happened."); 3272 } 3273 PetscFunctionReturn(0); 3274 } 3275 3276 #undef __FUNCT__ 3277 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update" 3278 int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) { 3279 int ierr; 3280 3281 PetscFunctionBegin; 3282 ierr = MatSeqBAIJ_UpdateSolvers(A); 3283 ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr); 3284 PetscFunctionReturn(0); 3285 } 3286 3287 3288