1 /*$Id: sbaijfact2.c,v 1.1 2000/06/21 15:47:02 balay Exp balay $*/ 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include "sbaij.h" 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/vec/vecimpl.h" 9 #include "src/inline/ilu.h" 10 #include "src/inline/dot.h" 11 12 #undef __FUNC__ 13 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering" 14 int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 15 { 16 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 int *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 Scalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*b[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PLogFlops(2*(a->nz) - a->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNC__ 56 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 Scalar s1,s2,x1,x2; 64 Scalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 68 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 69 70 /* forward solve the U^T */ 71 idx = 0; 72 for (i=0; i<n; i++) { 73 74 v = aa + 4*diag[i]; 75 /* multiply by the inverse of the block diagonal */ 76 x1 = b[idx]; x2 = b[1+idx]; 77 s1 = v[0]*x1 + v[1]*x2; 78 s2 = v[2]*x1 + v[3]*x2; 79 v += 4; 80 81 vi = aj + diag[i] + 1; 82 nz = ai[i+1] - diag[i] - 1; 83 while (nz--) { 84 oidx = 2*(*vi++); 85 x[oidx] -= v[0]*s1 + v[1]*s2; 86 x[oidx+1] -= v[2]*s1 + v[3]*s2; 87 v += 4; 88 } 89 x[idx] = s1;x[1+idx] = s2; 90 idx += 2; 91 } 92 /* backward solve the L^T */ 93 for (i=n-1; i>=0; i--){ 94 v = aa + 4*diag[i] - 4; 95 vi = aj + diag[i] - 1; 96 nz = diag[i] - ai[i]; 97 idt = 2*i; 98 s1 = x[idt]; s2 = x[1+idt]; 99 while (nz--) { 100 idx = 2*(*vi--); 101 x[idx] -= v[0]*s1 + v[1]*s2; 102 x[idx+1] -= v[2]*s1 + v[3]*s2; 103 v -= 4; 104 } 105 } 106 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 107 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 108 PLogFlops(2*4*(a->nz) - 2*a->n); 109 PetscFunctionReturn(0); 110 } 111 112 #undef __FUNC__ 113 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering" 114 int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 115 { 116 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 117 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 118 int *diag = a->diag,oidx; 119 MatScalar *aa=a->a,*v; 120 Scalar s1,s2,s3,x1,x2,x3; 121 Scalar *x,*b; 122 123 PetscFunctionBegin; 124 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126 127 /* forward solve the U^T */ 128 idx = 0; 129 for (i=0; i<n; i++) { 130 131 v = aa + 9*diag[i]; 132 /* multiply by the inverse of the block diagonal */ 133 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; 134 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 135 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 136 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 137 v += 9; 138 139 vi = aj + diag[i] + 1; 140 nz = ai[i+1] - diag[i] - 1; 141 while (nz--) { 142 oidx = 3*(*vi++); 143 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 144 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 145 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 146 v += 9; 147 } 148 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 149 idx += 3; 150 } 151 /* backward solve the L^T */ 152 for (i=n-1; i>=0; i--){ 153 v = aa + 9*diag[i] - 9; 154 vi = aj + diag[i] - 1; 155 nz = diag[i] - ai[i]; 156 idt = 3*i; 157 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 158 while (nz--) { 159 idx = 3*(*vi--); 160 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 161 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 162 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 163 v -= 9; 164 } 165 } 166 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 168 PLogFlops(2*9*(a->nz) - 3*a->n); 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNC__ 173 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering" 174 int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 175 { 176 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 177 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 178 int *diag = a->diag,oidx; 179 MatScalar *aa=a->a,*v; 180 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 181 Scalar *x,*b; 182 183 PetscFunctionBegin; 184 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 187 /* forward solve the U^T */ 188 idx = 0; 189 for (i=0; i<n; i++) { 190 191 v = aa + 16*diag[i]; 192 /* multiply by the inverse of the block diagonal */ 193 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; 194 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 195 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 196 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 197 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 198 v += 16; 199 200 vi = aj + diag[i] + 1; 201 nz = ai[i+1] - diag[i] - 1; 202 while (nz--) { 203 oidx = 4*(*vi++); 204 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 205 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 206 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 207 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 208 v += 16; 209 } 210 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 211 idx += 4; 212 } 213 /* backward solve the L^T */ 214 for (i=n-1; i>=0; i--){ 215 v = aa + 16*diag[i] - 16; 216 vi = aj + diag[i] - 1; 217 nz = diag[i] - ai[i]; 218 idt = 4*i; 219 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 220 while (nz--) { 221 idx = 4*(*vi--); 222 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 223 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 224 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 225 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 226 v -= 16; 227 } 228 } 229 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 230 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 231 PLogFlops(2*16*(a->nz) - 4*a->n); 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNC__ 236 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering" 237 int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 238 { 239 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 240 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 241 int *diag = a->diag,oidx; 242 MatScalar *aa=a->a,*v; 243 Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 244 Scalar *x,*b; 245 246 PetscFunctionBegin; 247 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 250 /* forward solve the U^T */ 251 idx = 0; 252 for (i=0; i<n; i++) { 253 254 v = aa + 25*diag[i]; 255 /* multiply by the inverse of the block diagonal */ 256 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 257 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 258 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 259 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 260 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 261 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 262 v += 25; 263 264 vi = aj + diag[i] + 1; 265 nz = ai[i+1] - diag[i] - 1; 266 while (nz--) { 267 oidx = 5*(*vi++); 268 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 269 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 270 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 271 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 272 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 273 v += 25; 274 } 275 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 276 idx += 5; 277 } 278 /* backward solve the L^T */ 279 for (i=n-1; i>=0; i--){ 280 v = aa + 25*diag[i] - 25; 281 vi = aj + diag[i] - 1; 282 nz = diag[i] - ai[i]; 283 idt = 5*i; 284 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 285 while (nz--) { 286 idx = 5*(*vi--); 287 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 288 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 289 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 290 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 291 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 292 v -= 25; 293 } 294 } 295 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 296 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 297 PLogFlops(2*25*(a->nz) - 5*a->n); 298 PetscFunctionReturn(0); 299 } 300 301 #undef __FUNC__ 302 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering" 303 int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 304 { 305 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 306 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 int *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 310 Scalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 314 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 315 316 /* forward solve the U^T */ 317 idx = 0; 318 for (i=0; i<n; i++) { 319 320 v = aa + 36*diag[i]; 321 /* multiply by the inverse of the block diagonal */ 322 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 323 x6 = b[5+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 325 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 326 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 327 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 328 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 329 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 330 v += 36; 331 332 vi = aj + diag[i] + 1; 333 nz = ai[i+1] - diag[i] - 1; 334 while (nz--) { 335 oidx = 6*(*vi++); 336 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 337 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 338 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 339 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 340 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 341 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 342 v += 36; 343 } 344 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 345 x[5+idx] = s6; 346 idx += 6; 347 } 348 /* backward solve the L^T */ 349 for (i=n-1; i>=0; i--){ 350 v = aa + 36*diag[i] - 36; 351 vi = aj + diag[i] - 1; 352 nz = diag[i] - ai[i]; 353 idt = 6*i; 354 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 355 s6 = x[5+idt]; 356 while (nz--) { 357 idx = 6*(*vi--); 358 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 359 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 360 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 361 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 362 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 363 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 364 v -= 36; 365 } 366 } 367 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 369 PLogFlops(2*36*(a->nz) - 6*a->n); 370 PetscFunctionReturn(0); 371 } 372 373 #undef __FUNC__ 374 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering" 375 int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 376 { 377 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 378 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 379 int *diag = a->diag,oidx; 380 MatScalar *aa=a->a,*v; 381 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 382 Scalar *x,*b; 383 384 PetscFunctionBegin; 385 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 387 388 /* forward solve the U^T */ 389 idx = 0; 390 for (i=0; i<n; i++) { 391 392 v = aa + 49*diag[i]; 393 /* multiply by the inverse of the block diagonal */ 394 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 395 x6 = b[5+idx]; x7 = b[6+idx]; 396 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 397 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 398 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 399 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 400 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 401 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 402 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 403 v += 49; 404 405 vi = aj + diag[i] + 1; 406 nz = ai[i+1] - diag[i] - 1; 407 while (nz--) { 408 oidx = 7*(*vi++); 409 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 410 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 411 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 412 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 413 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 414 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 415 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 416 v += 49; 417 } 418 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 419 x[5+idx] = s6;x[6+idx] = s7; 420 idx += 7; 421 } 422 /* backward solve the L^T */ 423 for (i=n-1; i>=0; i--){ 424 v = aa + 49*diag[i] - 49; 425 vi = aj + diag[i] - 1; 426 nz = diag[i] - ai[i]; 427 idt = 7*i; 428 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 429 s6 = x[5+idt];s7 = x[6+idt]; 430 while (nz--) { 431 idx = 7*(*vi--); 432 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 433 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 434 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 435 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 436 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 437 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 438 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 439 v -= 49; 440 } 441 } 442 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 444 PLogFlops(2*49*(a->nz) - 7*a->n); 445 PetscFunctionReturn(0); 446 } 447 448 /*---------------------------------------------------------------------------------------------*/ 449 #undef __FUNC__ 450 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1" 451 int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 452 { 453 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 454 IS iscol=a->col,isrow=a->row; 455 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 456 int *diag = a->diag; 457 MatScalar *aa=a->a,*v; 458 Scalar s1,*x,*b,*t; 459 460 PetscFunctionBegin; 461 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 462 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 463 t = a->solve_work; 464 465 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 466 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 467 468 /* copy the b into temp work space according to permutation */ 469 for (i=0; i<n; i++) { 470 t[i] = b[c[i]]; 471 } 472 473 /* forward solve the U^T */ 474 for (i=0; i<n; i++) { 475 476 v = aa + diag[i]; 477 /* multiply by the inverse of the block diagonal */ 478 s1 = (*v++)*t[i]; 479 vi = aj + diag[i] + 1; 480 nz = ai[i+1] - diag[i] - 1; 481 while (nz--) { 482 t[*vi++] -= (*v++)*s1; 483 } 484 t[i] = s1; 485 } 486 /* backward solve the L^T */ 487 for (i=n-1; i>=0; i--){ 488 v = aa + diag[i] - 1; 489 vi = aj + diag[i] - 1; 490 nz = diag[i] - ai[i]; 491 s1 = t[i]; 492 while (nz--) { 493 t[*vi--] -= (*v--)*s1; 494 } 495 } 496 497 /* copy t into x according to permutation */ 498 for (i=0; i<n; i++) { 499 x[r[i]] = t[i]; 500 } 501 502 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 503 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 504 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 505 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 506 PLogFlops(2*(a->nz) - a->n); 507 PetscFunctionReturn(0); 508 } 509 510 #undef __FUNC__ 511 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2" 512 int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx) 513 { 514 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 515 IS iscol=a->col,isrow=a->row; 516 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 517 int *diag = a->diag,ii,ic,ir,oidx; 518 MatScalar *aa=a->a,*v; 519 Scalar s1,s2,x1,x2; 520 Scalar *x,*b,*t; 521 522 PetscFunctionBegin; 523 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 524 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 525 t = a->solve_work; 526 527 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 528 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 529 530 /* copy the b into temp work space according to permutation */ 531 ii = 0; 532 for (i=0; i<n; i++) { 533 ic = 2*c[i]; 534 t[ii] = b[ic]; 535 t[ii+1] = b[ic+1]; 536 ii += 2; 537 } 538 539 /* forward solve the U^T */ 540 idx = 0; 541 for (i=0; i<n; i++) { 542 543 v = aa + 4*diag[i]; 544 /* multiply by the inverse of the block diagonal */ 545 x1 = t[idx]; x2 = t[1+idx]; 546 s1 = v[0]*x1 + v[1]*x2; 547 s2 = v[2]*x1 + v[3]*x2; 548 v += 4; 549 550 vi = aj + diag[i] + 1; 551 nz = ai[i+1] - diag[i] - 1; 552 while (nz--) { 553 oidx = 2*(*vi++); 554 t[oidx] -= v[0]*s1 + v[1]*s2; 555 t[oidx+1] -= v[2]*s1 + v[3]*s2; 556 v += 4; 557 } 558 t[idx] = s1;t[1+idx] = s2; 559 idx += 2; 560 } 561 /* backward solve the L^T */ 562 for (i=n-1; i>=0; i--){ 563 v = aa + 4*diag[i] - 4; 564 vi = aj + diag[i] - 1; 565 nz = diag[i] - ai[i]; 566 idt = 2*i; 567 s1 = t[idt]; s2 = t[1+idt]; 568 while (nz--) { 569 idx = 2*(*vi--); 570 t[idx] -= v[0]*s1 + v[1]*s2; 571 t[idx+1] -= v[2]*s1 + v[3]*s2; 572 v -= 4; 573 } 574 } 575 576 /* copy t into x according to permutation */ 577 ii = 0; 578 for (i=0; i<n; i++) { 579 ir = 2*r[i]; 580 x[ir] = t[ii]; 581 x[ir+1] = t[ii+1]; 582 ii += 2; 583 } 584 585 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 586 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 587 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 588 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 589 PLogFlops(2*4*(a->nz) - 2*a->n); 590 PetscFunctionReturn(0); 591 } 592 593 #undef __FUNC__ 594 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3" 595 int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx) 596 { 597 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 598 IS iscol=a->col,isrow=a->row; 599 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 600 int *diag = a->diag,ii,ic,ir,oidx; 601 MatScalar *aa=a->a,*v; 602 Scalar s1,s2,s3,x1,x2,x3; 603 Scalar *x,*b,*t; 604 605 PetscFunctionBegin; 606 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 607 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 608 t = a->solve_work; 609 610 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 611 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 612 613 /* copy the b into temp work space according to permutation */ 614 ii = 0; 615 for (i=0; i<n; i++) { 616 ic = 3*c[i]; 617 t[ii] = b[ic]; 618 t[ii+1] = b[ic+1]; 619 t[ii+2] = b[ic+2]; 620 ii += 3; 621 } 622 623 /* forward solve the U^T */ 624 idx = 0; 625 for (i=0; i<n; i++) { 626 627 v = aa + 9*diag[i]; 628 /* multiply by the inverse of the block diagonal */ 629 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 630 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 631 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 632 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 633 v += 9; 634 635 vi = aj + diag[i] + 1; 636 nz = ai[i+1] - diag[i] - 1; 637 while (nz--) { 638 oidx = 3*(*vi++); 639 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 640 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 641 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 642 v += 9; 643 } 644 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 645 idx += 3; 646 } 647 /* backward solve the L^T */ 648 for (i=n-1; i>=0; i--){ 649 v = aa + 9*diag[i] - 9; 650 vi = aj + diag[i] - 1; 651 nz = diag[i] - ai[i]; 652 idt = 3*i; 653 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 654 while (nz--) { 655 idx = 3*(*vi--); 656 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 657 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 658 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 659 v -= 9; 660 } 661 } 662 663 /* copy t into x according to permutation */ 664 ii = 0; 665 for (i=0; i<n; i++) { 666 ir = 3*r[i]; 667 x[ir] = t[ii]; 668 x[ir+1] = t[ii+1]; 669 x[ir+2] = t[ii+2]; 670 ii += 3; 671 } 672 673 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 674 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 675 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 676 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 677 PLogFlops(2*9*(a->nz) - 3*a->n); 678 PetscFunctionReturn(0); 679 } 680 681 #undef __FUNC__ 682 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4" 683 int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx) 684 { 685 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 686 IS iscol=a->col,isrow=a->row; 687 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 688 int *diag = a->diag,ii,ic,ir,oidx; 689 MatScalar *aa=a->a,*v; 690 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 691 Scalar *x,*b,*t; 692 693 PetscFunctionBegin; 694 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 695 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 696 t = a->solve_work; 697 698 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 699 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 700 701 /* copy the b into temp work space according to permutation */ 702 ii = 0; 703 for (i=0; i<n; i++) { 704 ic = 4*c[i]; 705 t[ii] = b[ic]; 706 t[ii+1] = b[ic+1]; 707 t[ii+2] = b[ic+2]; 708 t[ii+3] = b[ic+3]; 709 ii += 4; 710 } 711 712 /* forward solve the U^T */ 713 idx = 0; 714 for (i=0; i<n; i++) { 715 716 v = aa + 16*diag[i]; 717 /* multiply by the inverse of the block diagonal */ 718 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 719 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 720 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 721 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 722 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 723 v += 16; 724 725 vi = aj + diag[i] + 1; 726 nz = ai[i+1] - diag[i] - 1; 727 while (nz--) { 728 oidx = 4*(*vi++); 729 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 730 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 731 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 732 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 733 v += 16; 734 } 735 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 736 idx += 4; 737 } 738 /* backward solve the L^T */ 739 for (i=n-1; i>=0; i--){ 740 v = aa + 16*diag[i] - 16; 741 vi = aj + diag[i] - 1; 742 nz = diag[i] - ai[i]; 743 idt = 4*i; 744 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 745 while (nz--) { 746 idx = 4*(*vi--); 747 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 748 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 749 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 750 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 751 v -= 16; 752 } 753 } 754 755 /* copy t into x according to permutation */ 756 ii = 0; 757 for (i=0; i<n; i++) { 758 ir = 4*r[i]; 759 x[ir] = t[ii]; 760 x[ir+1] = t[ii+1]; 761 x[ir+2] = t[ii+2]; 762 x[ir+3] = t[ii+3]; 763 ii += 4; 764 } 765 766 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 767 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 768 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 769 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 770 PLogFlops(2*16*(a->nz) - 4*a->n); 771 PetscFunctionReturn(0); 772 } 773 774 #undef __FUNC__ 775 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5" 776 int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx) 777 { 778 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 779 IS iscol=a->col,isrow=a->row; 780 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 781 int *diag = a->diag,ii,ic,ir,oidx; 782 MatScalar *aa=a->a,*v; 783 Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 784 Scalar *x,*b,*t; 785 786 PetscFunctionBegin; 787 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 788 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 789 t = a->solve_work; 790 791 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 792 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 793 794 /* copy the b into temp work space according to permutation */ 795 ii = 0; 796 for (i=0; i<n; i++) { 797 ic = 5*c[i]; 798 t[ii] = b[ic]; 799 t[ii+1] = b[ic+1]; 800 t[ii+2] = b[ic+2]; 801 t[ii+3] = b[ic+3]; 802 t[ii+4] = b[ic+4]; 803 ii += 5; 804 } 805 806 /* forward solve the U^T */ 807 idx = 0; 808 for (i=0; i<n; i++) { 809 810 v = aa + 25*diag[i]; 811 /* multiply by the inverse of the block diagonal */ 812 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 813 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 814 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 815 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 816 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 817 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 818 v += 25; 819 820 vi = aj + diag[i] + 1; 821 nz = ai[i+1] - diag[i] - 1; 822 while (nz--) { 823 oidx = 5*(*vi++); 824 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 825 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 826 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 827 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 828 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 829 v += 25; 830 } 831 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 832 idx += 5; 833 } 834 /* backward solve the L^T */ 835 for (i=n-1; i>=0; i--){ 836 v = aa + 25*diag[i] - 25; 837 vi = aj + diag[i] - 1; 838 nz = diag[i] - ai[i]; 839 idt = 5*i; 840 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 841 while (nz--) { 842 idx = 5*(*vi--); 843 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 844 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 845 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 846 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 847 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 848 v -= 25; 849 } 850 } 851 852 /* copy t into x according to permutation */ 853 ii = 0; 854 for (i=0; i<n; i++) { 855 ir = 5*r[i]; 856 x[ir] = t[ii]; 857 x[ir+1] = t[ii+1]; 858 x[ir+2] = t[ii+2]; 859 x[ir+3] = t[ii+3]; 860 x[ir+4] = t[ii+4]; 861 ii += 5; 862 } 863 864 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 865 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 866 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 867 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 868 PLogFlops(2*25*(a->nz) - 5*a->n); 869 PetscFunctionReturn(0); 870 } 871 872 #undef __FUNC__ 873 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6" 874 int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx) 875 { 876 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 877 IS iscol=a->col,isrow=a->row; 878 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 879 int *diag = a->diag,ii,ic,ir,oidx; 880 MatScalar *aa=a->a,*v; 881 Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 882 Scalar *x,*b,*t; 883 884 PetscFunctionBegin; 885 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 886 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 887 t = a->solve_work; 888 889 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 890 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 891 892 /* copy the b into temp work space according to permutation */ 893 ii = 0; 894 for (i=0; i<n; i++) { 895 ic = 6*c[i]; 896 t[ii] = b[ic]; 897 t[ii+1] = b[ic+1]; 898 t[ii+2] = b[ic+2]; 899 t[ii+3] = b[ic+3]; 900 t[ii+4] = b[ic+4]; 901 t[ii+5] = b[ic+5]; 902 ii += 6; 903 } 904 905 /* forward solve the U^T */ 906 idx = 0; 907 for (i=0; i<n; i++) { 908 909 v = aa + 36*diag[i]; 910 /* multiply by the inverse of the block diagonal */ 911 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 912 x6 = t[5+idx]; 913 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 914 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 915 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 916 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 917 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 918 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 919 v += 36; 920 921 vi = aj + diag[i] + 1; 922 nz = ai[i+1] - diag[i] - 1; 923 while (nz--) { 924 oidx = 6*(*vi++); 925 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 926 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 927 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 928 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 929 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 930 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 931 v += 36; 932 } 933 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 934 t[5+idx] = s6; 935 idx += 6; 936 } 937 /* backward solve the L^T */ 938 for (i=n-1; i>=0; i--){ 939 v = aa + 36*diag[i] - 36; 940 vi = aj + diag[i] - 1; 941 nz = diag[i] - ai[i]; 942 idt = 6*i; 943 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 944 s6 = t[5+idt]; 945 while (nz--) { 946 idx = 6*(*vi--); 947 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 948 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 949 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 950 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 951 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 952 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 953 v -= 36; 954 } 955 } 956 957 /* copy t into x according to permutation */ 958 ii = 0; 959 for (i=0; i<n; i++) { 960 ir = 6*r[i]; 961 x[ir] = t[ii]; 962 x[ir+1] = t[ii+1]; 963 x[ir+2] = t[ii+2]; 964 x[ir+3] = t[ii+3]; 965 x[ir+4] = t[ii+4]; 966 x[ir+5] = t[ii+5]; 967 ii += 6; 968 } 969 970 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 971 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 972 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 973 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 974 PLogFlops(2*36*(a->nz) - 6*a->n); 975 PetscFunctionReturn(0); 976 } 977 978 #undef __FUNC__ 979 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7" 980 int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx) 981 { 982 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 983 IS iscol=a->col,isrow=a->row; 984 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 985 int *diag = a->diag,ii,ic,ir,oidx; 986 MatScalar *aa=a->a,*v; 987 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 988 Scalar *x,*b,*t; 989 990 PetscFunctionBegin; 991 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 992 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 993 t = a->solve_work; 994 995 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 996 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 997 998 /* copy the b into temp work space according to permutation */ 999 ii = 0; 1000 for (i=0; i<n; i++) { 1001 ic = 7*c[i]; 1002 t[ii] = b[ic]; 1003 t[ii+1] = b[ic+1]; 1004 t[ii+2] = b[ic+2]; 1005 t[ii+3] = b[ic+3]; 1006 t[ii+4] = b[ic+4]; 1007 t[ii+5] = b[ic+5]; 1008 t[ii+6] = b[ic+6]; 1009 ii += 7; 1010 } 1011 1012 /* forward solve the U^T */ 1013 idx = 0; 1014 for (i=0; i<n; i++) { 1015 1016 v = aa + 49*diag[i]; 1017 /* multiply by the inverse of the block diagonal */ 1018 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1019 x6 = t[5+idx]; x7 = t[6+idx]; 1020 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1021 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1022 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1023 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1024 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1025 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1026 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1027 v += 49; 1028 1029 vi = aj + diag[i] + 1; 1030 nz = ai[i+1] - diag[i] - 1; 1031 while (nz--) { 1032 oidx = 7*(*vi++); 1033 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1034 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1035 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1036 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1037 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1038 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1039 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1040 v += 49; 1041 } 1042 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1043 t[5+idx] = s6;t[6+idx] = s7; 1044 idx += 7; 1045 } 1046 /* backward solve the L^T */ 1047 for (i=n-1; i>=0; i--){ 1048 v = aa + 49*diag[i] - 49; 1049 vi = aj + diag[i] - 1; 1050 nz = diag[i] - ai[i]; 1051 idt = 7*i; 1052 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1053 s6 = t[5+idt];s7 = t[6+idt]; 1054 while (nz--) { 1055 idx = 7*(*vi--); 1056 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1057 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1058 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1059 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1060 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1061 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1062 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1063 v -= 49; 1064 } 1065 } 1066 1067 /* copy t into x according to permutation */ 1068 ii = 0; 1069 for (i=0; i<n; i++) { 1070 ir = 7*r[i]; 1071 x[ir] = t[ii]; 1072 x[ir+1] = t[ii+1]; 1073 x[ir+2] = t[ii+2]; 1074 x[ir+3] = t[ii+3]; 1075 x[ir+4] = t[ii+4]; 1076 x[ir+5] = t[ii+5]; 1077 x[ir+6] = t[ii+6]; 1078 ii += 7; 1079 } 1080 1081 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1082 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1083 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1084 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1085 PLogFlops(2*49*(a->nz) - 7*a->n); 1086 PetscFunctionReturn(0); 1087 } 1088 1089 /* ----------------------------------------------------------- */ 1090 #undef __FUNC__ 1091 #define __FUNC__ "MatSolve_SeqSBAIJ_N" 1092 int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx) 1093 { 1094 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1095 IS iscol=a->col,isrow=a->row; 1096 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1097 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1098 MatScalar *aa=a->a,*v; 1099 Scalar *x,*b,*s,*t,*ls; 1100 1101 PetscFunctionBegin; 1102 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1103 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1104 t = a->solve_work; 1105 1106 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1107 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1108 1109 /* forward solve the lower triangular */ 1110 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 1111 for (i=1; i<n; i++) { 1112 v = aa + bs2*ai[i]; 1113 vi = aj + ai[i]; 1114 nz = a->diag[i] - ai[i]; 1115 s = t + bs*i; 1116 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 1117 while (nz--) { 1118 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1119 v += bs2; 1120 } 1121 } 1122 /* backward solve the upper triangular */ 1123 ls = a->solve_work + a->n; 1124 for (i=n-1; i>=0; i--){ 1125 v = aa + bs2*(a->diag[i] + 1); 1126 vi = aj + a->diag[i] + 1; 1127 nz = ai[i+1] - a->diag[i] - 1; 1128 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 1129 while (nz--) { 1130 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1131 v += bs2; 1132 } 1133 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1134 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 1135 } 1136 1137 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1138 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1139 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1140 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1141 PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n); 1142 PetscFunctionReturn(0); 1143 } 1144 1145 #undef __FUNC__ 1146 #define __FUNC__ "MatSolve_SeqSBAIJ_7" 1147 int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx) 1148 { 1149 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1150 IS iscol=a->col,isrow=a->row; 1151 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1152 int *diag = a->diag; 1153 MatScalar *aa=a->a,*v; 1154 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1155 Scalar *x,*b,*t; 1156 1157 PetscFunctionBegin; 1158 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1159 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1160 t = a->solve_work; 1161 1162 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1163 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1164 1165 /* forward solve the lower triangular */ 1166 idx = 7*(*r++); 1167 t[0] = b[idx]; t[1] = b[1+idx]; 1168 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1169 t[5] = b[5+idx]; t[6] = b[6+idx]; 1170 1171 for (i=1; i<n; i++) { 1172 v = aa + 49*ai[i]; 1173 vi = aj + ai[i]; 1174 nz = diag[i] - ai[i]; 1175 idx = 7*(*r++); 1176 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1177 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1178 while (nz--) { 1179 idx = 7*(*vi++); 1180 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1181 x4 = t[3+idx];x5 = t[4+idx]; 1182 x6 = t[5+idx];x7 = t[6+idx]; 1183 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1184 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1185 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1186 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1187 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1188 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1189 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1190 v += 49; 1191 } 1192 idx = 7*i; 1193 t[idx] = s1;t[1+idx] = s2; 1194 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1195 t[5+idx] = s6;t[6+idx] = s7; 1196 } 1197 /* backward solve the upper triangular */ 1198 for (i=n-1; i>=0; i--){ 1199 v = aa + 49*diag[i] + 49; 1200 vi = aj + diag[i] + 1; 1201 nz = ai[i+1] - diag[i] - 1; 1202 idt = 7*i; 1203 s1 = t[idt]; s2 = t[1+idt]; 1204 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1205 s6 = t[5+idt];s7 = t[6+idt]; 1206 while (nz--) { 1207 idx = 7*(*vi++); 1208 x1 = t[idx]; x2 = t[1+idx]; 1209 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1210 x6 = t[5+idx]; x7 = t[6+idx]; 1211 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1212 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1213 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1214 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1215 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1216 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1217 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1218 v += 49; 1219 } 1220 idc = 7*(*c--); 1221 v = aa + 49*diag[i]; 1222 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1223 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1224 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1225 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1226 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1227 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1228 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1229 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1230 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1231 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1232 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1233 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1234 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1235 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1236 } 1237 1238 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1239 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1240 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1241 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1242 PLogFlops(2*49*(a->nz) - 7*a->n); 1243 PetscFunctionReturn(0); 1244 } 1245 1246 #undef __FUNC__ 1247 #define __FUNC__ "MatSolve_SeqSBAIJ_7_NaturalOrdering" 1248 int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1249 { 1250 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1251 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1252 int ierr,*diag = a->diag,jdx; 1253 MatScalar *aa=a->a,*v; 1254 Scalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1255 1256 PetscFunctionBegin; 1257 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1258 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1259 /* forward solve the lower triangular */ 1260 idx = 0; 1261 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1262 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1263 x[6] = b[6+idx]; 1264 for (i=1; i<n; i++) { 1265 v = aa + 49*ai[i]; 1266 vi = aj + ai[i]; 1267 nz = diag[i] - ai[i]; 1268 idx = 7*i; 1269 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1270 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1271 s7 = b[6+idx]; 1272 while (nz--) { 1273 jdx = 7*(*vi++); 1274 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1275 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1276 x7 = x[6+jdx]; 1277 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1278 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1279 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1280 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1281 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1282 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1283 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1284 v += 49; 1285 } 1286 x[idx] = s1; 1287 x[1+idx] = s2; 1288 x[2+idx] = s3; 1289 x[3+idx] = s4; 1290 x[4+idx] = s5; 1291 x[5+idx] = s6; 1292 x[6+idx] = s7; 1293 } 1294 /* backward solve the upper triangular */ 1295 for (i=n-1; i>=0; i--){ 1296 v = aa + 49*diag[i] + 49; 1297 vi = aj + diag[i] + 1; 1298 nz = ai[i+1] - diag[i] - 1; 1299 idt = 7*i; 1300 s1 = x[idt]; s2 = x[1+idt]; 1301 s3 = x[2+idt]; s4 = x[3+idt]; 1302 s5 = x[4+idt]; s6 = x[5+idt]; 1303 s7 = x[6+idt]; 1304 while (nz--) { 1305 idx = 7*(*vi++); 1306 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1307 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1308 x7 = x[6+idx]; 1309 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1310 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1311 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1312 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1313 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1314 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1315 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1316 v += 49; 1317 } 1318 v = aa + 49*diag[i]; 1319 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1320 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1321 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1322 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1323 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1324 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1325 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1326 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1327 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1328 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1329 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1330 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1331 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1332 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1333 } 1334 1335 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1336 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1337 PLogFlops(2*36*(a->nz) - 6*a->n); 1338 PetscFunctionReturn(0); 1339 } 1340 1341 #undef __FUNC__ 1342 #define __FUNC__ "MatSolve_SeqSBAIJ_6" 1343 int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx) 1344 { 1345 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1346 IS iscol=a->col,isrow=a->row; 1347 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1348 int *diag = a->diag; 1349 MatScalar *aa=a->a,*v; 1350 Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1351 1352 PetscFunctionBegin; 1353 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1354 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1355 t = a->solve_work; 1356 1357 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1358 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1359 1360 /* forward solve the lower triangular */ 1361 idx = 6*(*r++); 1362 t[0] = b[idx]; t[1] = b[1+idx]; 1363 t[2] = b[2+idx]; t[3] = b[3+idx]; 1364 t[4] = b[4+idx]; t[5] = b[5+idx]; 1365 for (i=1; i<n; i++) { 1366 v = aa + 36*ai[i]; 1367 vi = aj + ai[i]; 1368 nz = diag[i] - ai[i]; 1369 idx = 6*(*r++); 1370 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1371 s5 = b[4+idx]; s6 = b[5+idx]; 1372 while (nz--) { 1373 idx = 6*(*vi++); 1374 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1375 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1376 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1377 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1378 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1379 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1380 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1381 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1382 v += 36; 1383 } 1384 idx = 6*i; 1385 t[idx] = s1;t[1+idx] = s2; 1386 t[2+idx] = s3;t[3+idx] = s4; 1387 t[4+idx] = s5;t[5+idx] = s6; 1388 } 1389 /* backward solve the upper triangular */ 1390 for (i=n-1; i>=0; i--){ 1391 v = aa + 36*diag[i] + 36; 1392 vi = aj + diag[i] + 1; 1393 nz = ai[i+1] - diag[i] - 1; 1394 idt = 6*i; 1395 s1 = t[idt]; s2 = t[1+idt]; 1396 s3 = t[2+idt];s4 = t[3+idt]; 1397 s5 = t[4+idt];s6 = t[5+idt]; 1398 while (nz--) { 1399 idx = 6*(*vi++); 1400 x1 = t[idx]; x2 = t[1+idx]; 1401 x3 = t[2+idx]; x4 = t[3+idx]; 1402 x5 = t[4+idx]; x6 = t[5+idx]; 1403 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1404 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1405 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1406 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1407 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1408 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1409 v += 36; 1410 } 1411 idc = 6*(*c--); 1412 v = aa + 36*diag[i]; 1413 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1414 v[18]*s4+v[24]*s5+v[30]*s6; 1415 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1416 v[19]*s4+v[25]*s5+v[31]*s6; 1417 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1418 v[20]*s4+v[26]*s5+v[32]*s6; 1419 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1420 v[21]*s4+v[27]*s5+v[33]*s6; 1421 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1422 v[22]*s4+v[28]*s5+v[34]*s6; 1423 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1424 v[23]*s4+v[29]*s5+v[35]*s6; 1425 } 1426 1427 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1428 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1429 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1430 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1431 PLogFlops(2*36*(a->nz) - 6*a->n); 1432 PetscFunctionReturn(0); 1433 } 1434 1435 #undef __FUNC__ 1436 #define __FUNC__ "MatSolve_SeqSBAIJ_6_NaturalOrdering" 1437 int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1438 { 1439 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1440 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1441 int ierr,*diag = a->diag,jdx; 1442 MatScalar *aa=a->a,*v; 1443 Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1444 1445 PetscFunctionBegin; 1446 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1447 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1448 /* forward solve the lower triangular */ 1449 idx = 0; 1450 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1451 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1452 for (i=1; i<n; i++) { 1453 v = aa + 36*ai[i]; 1454 vi = aj + ai[i]; 1455 nz = diag[i] - ai[i]; 1456 idx = 6*i; 1457 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1458 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1459 while (nz--) { 1460 jdx = 6*(*vi++); 1461 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1462 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1463 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1464 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1465 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1466 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1467 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1468 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1469 v += 36; 1470 } 1471 x[idx] = s1; 1472 x[1+idx] = s2; 1473 x[2+idx] = s3; 1474 x[3+idx] = s4; 1475 x[4+idx] = s5; 1476 x[5+idx] = s6; 1477 } 1478 /* backward solve the upper triangular */ 1479 for (i=n-1; i>=0; i--){ 1480 v = aa + 36*diag[i] + 36; 1481 vi = aj + diag[i] + 1; 1482 nz = ai[i+1] - diag[i] - 1; 1483 idt = 6*i; 1484 s1 = x[idt]; s2 = x[1+idt]; 1485 s3 = x[2+idt]; s4 = x[3+idt]; 1486 s5 = x[4+idt]; s6 = x[5+idt]; 1487 while (nz--) { 1488 idx = 6*(*vi++); 1489 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1490 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1491 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1492 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1493 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1494 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1495 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1496 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1497 v += 36; 1498 } 1499 v = aa + 36*diag[i]; 1500 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1501 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1502 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1503 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1504 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1505 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1506 } 1507 1508 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1509 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1510 PLogFlops(2*36*(a->nz) - 6*a->n); 1511 PetscFunctionReturn(0); 1512 } 1513 1514 #undef __FUNC__ 1515 #define __FUNC__ "MatSolve_SeqSBAIJ_5" 1516 int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx) 1517 { 1518 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1519 IS iscol=a->col,isrow=a->row; 1520 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1521 int *diag = a->diag; 1522 MatScalar *aa=a->a,*v; 1523 Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1524 1525 PetscFunctionBegin; 1526 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1527 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1528 t = a->solve_work; 1529 1530 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1531 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1532 1533 /* forward solve the lower triangular */ 1534 idx = 5*(*r++); 1535 t[0] = b[idx]; t[1] = b[1+idx]; 1536 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1537 for (i=1; i<n; i++) { 1538 v = aa + 25*ai[i]; 1539 vi = aj + ai[i]; 1540 nz = diag[i] - ai[i]; 1541 idx = 5*(*r++); 1542 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1543 s5 = b[4+idx]; 1544 while (nz--) { 1545 idx = 5*(*vi++); 1546 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1547 x4 = t[3+idx];x5 = t[4+idx]; 1548 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1549 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1550 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1551 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1552 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1553 v += 25; 1554 } 1555 idx = 5*i; 1556 t[idx] = s1;t[1+idx] = s2; 1557 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1558 } 1559 /* backward solve the upper triangular */ 1560 for (i=n-1; i>=0; i--){ 1561 v = aa + 25*diag[i] + 25; 1562 vi = aj + diag[i] + 1; 1563 nz = ai[i+1] - diag[i] - 1; 1564 idt = 5*i; 1565 s1 = t[idt]; s2 = t[1+idt]; 1566 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1567 while (nz--) { 1568 idx = 5*(*vi++); 1569 x1 = t[idx]; x2 = t[1+idx]; 1570 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1571 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1572 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1573 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1574 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1575 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1576 v += 25; 1577 } 1578 idc = 5*(*c--); 1579 v = aa + 25*diag[i]; 1580 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1581 v[15]*s4+v[20]*s5; 1582 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1583 v[16]*s4+v[21]*s5; 1584 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1585 v[17]*s4+v[22]*s5; 1586 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1587 v[18]*s4+v[23]*s5; 1588 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1589 v[19]*s4+v[24]*s5; 1590 } 1591 1592 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1593 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1594 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1595 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1596 PLogFlops(2*25*(a->nz) - 5*a->n); 1597 PetscFunctionReturn(0); 1598 } 1599 1600 #undef __FUNC__ 1601 #define __FUNC__ "MatSolve_SeqSBAIJ_5_NaturalOrdering" 1602 int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1603 { 1604 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1605 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1606 int ierr,*diag = a->diag,jdx; 1607 MatScalar *aa=a->a,*v; 1608 Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1609 1610 PetscFunctionBegin; 1611 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1612 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1613 /* forward solve the lower triangular */ 1614 idx = 0; 1615 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1616 for (i=1; i<n; i++) { 1617 v = aa + 25*ai[i]; 1618 vi = aj + ai[i]; 1619 nz = diag[i] - ai[i]; 1620 idx = 5*i; 1621 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1622 while (nz--) { 1623 jdx = 5*(*vi++); 1624 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1625 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1626 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1627 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1628 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1629 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1630 v += 25; 1631 } 1632 x[idx] = s1; 1633 x[1+idx] = s2; 1634 x[2+idx] = s3; 1635 x[3+idx] = s4; 1636 x[4+idx] = s5; 1637 } 1638 /* backward solve the upper triangular */ 1639 for (i=n-1; i>=0; i--){ 1640 v = aa + 25*diag[i] + 25; 1641 vi = aj + diag[i] + 1; 1642 nz = ai[i+1] - diag[i] - 1; 1643 idt = 5*i; 1644 s1 = x[idt]; s2 = x[1+idt]; 1645 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1646 while (nz--) { 1647 idx = 5*(*vi++); 1648 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1649 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1650 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1651 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1652 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1653 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1654 v += 25; 1655 } 1656 v = aa + 25*diag[i]; 1657 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1658 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1659 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1660 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1661 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1662 } 1663 1664 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1665 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1666 PLogFlops(2*25*(a->nz) - 5*a->n); 1667 PetscFunctionReturn(0); 1668 } 1669 1670 #undef __FUNC__ 1671 #define __FUNC__ "MatSolve_SeqSBAIJ_4" 1672 int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx) 1673 { 1674 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1675 IS iscol=a->col,isrow=a->row; 1676 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1677 int *diag = a->diag; 1678 MatScalar *aa=a->a,*v; 1679 Scalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1680 1681 PetscFunctionBegin; 1682 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1683 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1684 t = a->solve_work; 1685 1686 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1687 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1688 1689 /* forward solve the lower triangular */ 1690 idx = 4*(*r++); 1691 t[0] = b[idx]; t[1] = b[1+idx]; 1692 t[2] = b[2+idx]; t[3] = b[3+idx]; 1693 for (i=1; i<n; i++) { 1694 v = aa + 16*ai[i]; 1695 vi = aj + ai[i]; 1696 nz = diag[i] - ai[i]; 1697 idx = 4*(*r++); 1698 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1699 while (nz--) { 1700 idx = 4*(*vi++); 1701 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1702 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1703 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1704 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1705 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1706 v += 16; 1707 } 1708 idx = 4*i; 1709 t[idx] = s1;t[1+idx] = s2; 1710 t[2+idx] = s3;t[3+idx] = s4; 1711 } 1712 /* backward solve the upper triangular */ 1713 for (i=n-1; i>=0; i--){ 1714 v = aa + 16*diag[i] + 16; 1715 vi = aj + diag[i] + 1; 1716 nz = ai[i+1] - diag[i] - 1; 1717 idt = 4*i; 1718 s1 = t[idt]; s2 = t[1+idt]; 1719 s3 = t[2+idt];s4 = t[3+idt]; 1720 while (nz--) { 1721 idx = 4*(*vi++); 1722 x1 = t[idx]; x2 = t[1+idx]; 1723 x3 = t[2+idx]; x4 = t[3+idx]; 1724 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1725 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1726 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1727 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1728 v += 16; 1729 } 1730 idc = 4*(*c--); 1731 v = aa + 16*diag[i]; 1732 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1733 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1734 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1735 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1736 } 1737 1738 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1739 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1741 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1742 PLogFlops(2*16*(a->nz) - 4*a->n); 1743 PetscFunctionReturn(0); 1744 } 1745 1746 1747 /* 1748 Special case where the matrix was ILU(0) factored in the natural 1749 ordering. This eliminates the need for the column and row permutation. 1750 */ 1751 #undef __FUNC__ 1752 #define __FUNC__ "MatSolve_SeqSBAIJ_4_NaturalOrdering" 1753 int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 1754 { 1755 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1756 int n=a->mbs,*ai=a->i,*aj=a->j; 1757 int ierr,*diag = a->diag; 1758 MatScalar *aa=a->a; 1759 Scalar *x,*b; 1760 1761 PetscFunctionBegin; 1762 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1763 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1764 1765 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 1766 { 1767 static Scalar w[2000]; /* very BAD need to fix */ 1768 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 1769 } 1770 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 1771 { 1772 static Scalar w[2000]; /* very BAD need to fix */ 1773 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 1774 } 1775 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 1776 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 1777 #else 1778 { 1779 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 1780 MatScalar *v; 1781 int jdx,idt,idx,nz,*vi,i,ai16; 1782 1783 /* forward solve the lower triangular */ 1784 idx = 0; 1785 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 1786 for (i=1; i<n; i++) { 1787 v = aa + 16*ai[i]; 1788 vi = aj + ai[i]; 1789 nz = diag[i] - ai[i]; 1790 idx += 4; 1791 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1792 while (nz--) { 1793 jdx = 4*(*vi++); 1794 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 1795 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1796 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1797 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1798 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1799 v += 16; 1800 } 1801 x[idx] = s1; 1802 x[1+idx] = s2; 1803 x[2+idx] = s3; 1804 x[3+idx] = s4; 1805 } 1806 /* backward solve the upper triangular */ 1807 idt = 4*(n-1); 1808 for (i=n-1; i>=0; i--){ 1809 ai16 = 16*diag[i]; 1810 v = aa + ai16 + 16; 1811 vi = aj + diag[i] + 1; 1812 nz = ai[i+1] - diag[i] - 1; 1813 s1 = x[idt]; s2 = x[1+idt]; 1814 s3 = x[2+idt];s4 = x[3+idt]; 1815 while (nz--) { 1816 idx = 4*(*vi++); 1817 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 1818 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1819 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1820 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1821 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1822 v += 16; 1823 } 1824 v = aa + ai16; 1825 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 1826 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 1827 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 1828 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 1829 idt -= 4; 1830 } 1831 } 1832 #endif 1833 1834 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1835 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1836 PLogFlops(2*16*(a->nz) - 4*a->n); 1837 PetscFunctionReturn(0); 1838 } 1839 1840 #undef __FUNC__ 1841 #define __FUNC__ "MatSolve_SeqSBAIJ_3" 1842 int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx) 1843 { 1844 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1845 IS iscol=a->col,isrow=a->row; 1846 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1847 int *diag = a->diag; 1848 MatScalar *aa=a->a,*v; 1849 Scalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 1850 1851 PetscFunctionBegin; 1852 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1853 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1854 t = a->solve_work; 1855 1856 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1857 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1858 1859 /* forward solve the lower triangular */ 1860 idx = 3*(*r++); 1861 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1862 for (i=1; i<n; i++) { 1863 v = aa + 9*ai[i]; 1864 vi = aj + ai[i]; 1865 nz = diag[i] - ai[i]; 1866 idx = 3*(*r++); 1867 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1868 while (nz--) { 1869 idx = 3*(*vi++); 1870 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1871 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1872 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1873 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1874 v += 9; 1875 } 1876 idx = 3*i; 1877 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1878 } 1879 /* backward solve the upper triangular */ 1880 for (i=n-1; i>=0; i--){ 1881 v = aa + 9*diag[i] + 9; 1882 vi = aj + diag[i] + 1; 1883 nz = ai[i+1] - diag[i] - 1; 1884 idt = 3*i; 1885 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1886 while (nz--) { 1887 idx = 3*(*vi++); 1888 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1889 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1890 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1891 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1892 v += 9; 1893 } 1894 idc = 3*(*c--); 1895 v = aa + 9*diag[i]; 1896 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1897 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1898 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1899 } 1900 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1901 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1902 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1903 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1904 PLogFlops(2*9*(a->nz) - 3*a->n); 1905 PetscFunctionReturn(0); 1906 } 1907 1908 /* 1909 Special case where the matrix was ILU(0) factored in the natural 1910 ordering. This eliminates the need for the column and row permutation. 1911 */ 1912 #undef __FUNC__ 1913 #define __FUNC__ "MatSolve_SeqSBAIJ_3_NaturalOrdering" 1914 int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 1915 { 1916 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1917 int n=a->mbs,*ai=a->i,*aj=a->j; 1918 int ierr,*diag = a->diag; 1919 MatScalar *aa=a->a,*v; 1920 Scalar *x,*b,s1,s2,s3,x1,x2,x3; 1921 int jdx,idt,idx,nz,*vi,i; 1922 1923 PetscFunctionBegin; 1924 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1925 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1926 1927 1928 /* forward solve the lower triangular */ 1929 idx = 0; 1930 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 1931 for (i=1; i<n; i++) { 1932 v = aa + 9*ai[i]; 1933 vi = aj + ai[i]; 1934 nz = diag[i] - ai[i]; 1935 idx += 3; 1936 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 1937 while (nz--) { 1938 jdx = 3*(*vi++); 1939 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 1940 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1941 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1942 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1943 v += 9; 1944 } 1945 x[idx] = s1; 1946 x[1+idx] = s2; 1947 x[2+idx] = s3; 1948 } 1949 /* backward solve the upper triangular */ 1950 for (i=n-1; i>=0; i--){ 1951 v = aa + 9*diag[i] + 9; 1952 vi = aj + diag[i] + 1; 1953 nz = ai[i+1] - diag[i] - 1; 1954 idt = 3*i; 1955 s1 = x[idt]; s2 = x[1+idt]; 1956 s3 = x[2+idt]; 1957 while (nz--) { 1958 idx = 3*(*vi++); 1959 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 1960 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1961 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1962 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1963 v += 9; 1964 } 1965 v = aa + 9*diag[i]; 1966 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1967 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1968 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1969 } 1970 1971 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1972 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1973 PLogFlops(2*9*(a->nz) - 3*a->n); 1974 PetscFunctionReturn(0); 1975 } 1976 1977 #undef __FUNC__ 1978 #define __FUNC__ "MatSolve_SeqSBAIJ_2" 1979 int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx) 1980 { 1981 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1982 IS iscol=a->col,isrow=a->row; 1983 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1984 int *diag = a->diag; 1985 MatScalar *aa=a->a,*v; 1986 Scalar *x,*b,s1,s2,x1,x2,*t; 1987 1988 PetscFunctionBegin; 1989 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1990 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1991 t = a->solve_work; 1992 1993 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1994 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1995 1996 /* forward solve the lower triangular */ 1997 idx = 2*(*r++); 1998 t[0] = b[idx]; t[1] = b[1+idx]; 1999 for (i=1; i<n; i++) { 2000 v = aa + 4*ai[i]; 2001 vi = aj + ai[i]; 2002 nz = diag[i] - ai[i]; 2003 idx = 2*(*r++); 2004 s1 = b[idx]; s2 = b[1+idx]; 2005 while (nz--) { 2006 idx = 2*(*vi++); 2007 x1 = t[idx]; x2 = t[1+idx]; 2008 s1 -= v[0]*x1 + v[2]*x2; 2009 s2 -= v[1]*x1 + v[3]*x2; 2010 v += 4; 2011 } 2012 idx = 2*i; 2013 t[idx] = s1; t[1+idx] = s2; 2014 } 2015 /* backward solve the upper triangular */ 2016 for (i=n-1; i>=0; i--){ 2017 v = aa + 4*diag[i] + 4; 2018 vi = aj + diag[i] + 1; 2019 nz = ai[i+1] - diag[i] - 1; 2020 idt = 2*i; 2021 s1 = t[idt]; s2 = t[1+idt]; 2022 while (nz--) { 2023 idx = 2*(*vi++); 2024 x1 = t[idx]; x2 = t[1+idx]; 2025 s1 -= v[0]*x1 + v[2]*x2; 2026 s2 -= v[1]*x1 + v[3]*x2; 2027 v += 4; 2028 } 2029 idc = 2*(*c--); 2030 v = aa + 4*diag[i]; 2031 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2032 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2033 } 2034 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2035 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2036 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2037 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2038 PLogFlops(2*4*(a->nz) - 2*a->n); 2039 PetscFunctionReturn(0); 2040 } 2041 2042 /* 2043 Special case where the matrix was ILU(0) factored in the natural 2044 ordering. This eliminates the need for the column and row permutation. 2045 */ 2046 #undef __FUNC__ 2047 #define __FUNC__ "MatSolve_SeqSBAIJ_2_NaturalOrdering" 2048 int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2049 { 2050 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2051 int n=a->mbs,*ai=a->i,*aj=a->j; 2052 int ierr,*diag = a->diag; 2053 MatScalar *aa=a->a,*v; 2054 Scalar *x,*b,s1,s2,x1,x2; 2055 int jdx,idt,idx,nz,*vi,i; 2056 2057 PetscFunctionBegin; 2058 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2059 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2060 2061 /* forward solve the lower triangular */ 2062 idx = 0; 2063 x[0] = b[0]; x[1] = b[1]; 2064 for (i=1; i<n; i++) { 2065 v = aa + 4*ai[i]; 2066 vi = aj + ai[i]; 2067 nz = diag[i] - ai[i]; 2068 idx += 2; 2069 s1 = b[idx];s2 = b[1+idx]; 2070 while (nz--) { 2071 jdx = 2*(*vi++); 2072 x1 = x[jdx];x2 = x[1+jdx]; 2073 s1 -= v[0]*x1 + v[2]*x2; 2074 s2 -= v[1]*x1 + v[3]*x2; 2075 v += 4; 2076 } 2077 x[idx] = s1; 2078 x[1+idx] = s2; 2079 } 2080 /* backward solve the upper triangular */ 2081 for (i=n-1; i>=0; i--){ 2082 v = aa + 4*diag[i] + 4; 2083 vi = aj + diag[i] + 1; 2084 nz = ai[i+1] - diag[i] - 1; 2085 idt = 2*i; 2086 s1 = x[idt]; s2 = x[1+idt]; 2087 while (nz--) { 2088 idx = 2*(*vi++); 2089 x1 = x[idx]; x2 = x[1+idx]; 2090 s1 -= v[0]*x1 + v[2]*x2; 2091 s2 -= v[1]*x1 + v[3]*x2; 2092 v += 4; 2093 } 2094 v = aa + 4*diag[i]; 2095 x[idt] = v[0]*s1 + v[2]*s2; 2096 x[1+idt] = v[1]*s1 + v[3]*s2; 2097 } 2098 2099 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2100 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2101 PLogFlops(2*4*(a->nz) - 2*a->n); 2102 PetscFunctionReturn(0); 2103 } 2104 2105 /*----------- New 1 --------------*/ 2106 #undef __FUNC__ 2107 #define __FUNC__ "MatSolve_SeqSBAIJ_1" 2108 int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 2109 { 2110 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2111 IS isrow=a->row; 2112 int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip; 2113 MatScalar *aa=a->a,*v; 2114 Scalar *x,*b,xk,*xtmp; 2115 int nz,*vj,k; 2116 2117 PetscFunctionBegin; 2118 if (!mbs) PetscFunctionReturn(0); 2119 2120 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2121 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2122 xtmp = a->solve_work; 2123 2124 ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr); 2125 2126 /* solve U'*D*y = b by forward substitution */ 2127 for (k=0; k<mbs; k++) xtmp[k] = b[rip[k]]; 2128 for (k=0; k<mbs; k++){ 2129 v = aa + ai[k]; 2130 vj = aj + ai[k]; 2131 xk = xtmp[k]; 2132 nz = ai[k+1] - ai[k]; 2133 while (nz--) xtmp[*vj++] += (*v++) * xk; 2134 xtmp[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */ 2135 } 2136 2137 /* solve U*x = y by back substitution */ 2138 2139 for (k=mbs-1; k>=0; k--){ 2140 v = aa + ai[k]; 2141 vj = aj + ai[k]; 2142 xk = xtmp[k]; 2143 nz = ai[k+1] - ai[k]; 2144 while (nz--) xk += (*v++) * xtmp[*vj++]; 2145 xtmp[k] = xk; 2146 x[rip[k]] = xk; 2147 } 2148 2149 ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr); 2150 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2151 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2152 PLogFlops(4*a->s_nz + a->m); 2153 PetscFunctionReturn(0); 2154 } 2155 2156 2157 #ifdef CONT 2158 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2159 IS iscol=a->col,isrow=a->row; 2160 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2161 int *diag = a->diag; 2162 MatScalar *aa=a->a,*v; 2163 Scalar *x,*b,s1,*t; 2164 2165 PetscFunctionBegin; 2166 if (!n) PetscFunctionReturn(0); 2167 2168 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2169 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2170 t = a->solve_work; 2171 2172 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2173 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2174 2175 /* forward solve the lower triangular */ 2176 t[0] = b[*r++]; 2177 for (i=1; i<n; i++) { 2178 v = aa + ai[i]; 2179 vi = aj + ai[i]; 2180 nz = diag[i] - ai[i]; 2181 s1 = b[*r++]; 2182 while (nz--) { 2183 s1 -= (*v++)*t[*vi++]; 2184 } 2185 t[i] = s1; 2186 } 2187 /* backward solve the upper triangular */ 2188 for (i=n-1; i>=0; i--){ 2189 v = aa + diag[i] + 1; 2190 vi = aj + diag[i] + 1; 2191 nz = ai[i+1] - diag[i] - 1; 2192 s1 = t[i]; 2193 while (nz--) { 2194 s1 -= (*v++)*t[*vi++]; 2195 } 2196 x[*c--] = t[i] = aa[diag[i]]*s1; 2197 } 2198 2199 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2200 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2201 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2202 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2203 PLogFlops(2*1*(a->nz) - a->n); 2204 PetscFunctionReturn(0); 2205 } 2206 2207 #endif 2208 /*----------- New 1 --------------*/ 2209 /* 2210 Special case where the matrix was ILU(0) factored in the natural 2211 ordering. This eliminates the need for the column and row permutation. 2212 */ 2213 #undef __FUNC__ 2214 #define __FUNC__ "MatSolve_SeqSBAIJ_1_NaturalOrdering" 2215 int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2216 { 2217 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2218 int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr; 2219 MatScalar *aa=a->a,*v; 2220 Scalar *x,*b,xk; 2221 int nz,*vj,k; 2222 2223 PetscFunctionBegin; 2224 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2225 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2226 2227 /* solve U'*D*y = b by forward substitution */ 2228 for (k=0; k<mbs; k++) x[k] = b[k]; 2229 for (k=0; k<mbs; k++){ 2230 v = aa + ai[k]; 2231 vj = aj + ai[k]; 2232 xk = x[k]; 2233 nz = ai[k+1] - ai[k]; 2234 while (nz--) x[*vj++] += (*v++) * xk; 2235 x[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */ 2236 } 2237 2238 /* solve U*x = y by back substitution */ 2239 for (k=mbs-2; k>=0; k--){ 2240 v = aa + ai[k]; 2241 vj = aj + ai[k]; 2242 xk = x[k]; 2243 nz = ai[k+1] - ai[k]; 2244 while (nz--) xk += (*v++) * x[*vj++]; 2245 x[k] = xk; 2246 } 2247 2248 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2249 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2250 PLogFlops(4*a->s_nz + a->m); 2251 PetscFunctionReturn(0); 2252 } 2253 2254 /* ----------------------------------------------------------------*/ 2255 #undef __FUNC__ 2256 #define __FUNC__ "MatILUFactorSymbolic_SeqSBAIJ" 2257 int MatILUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *B) 2258 { 2259 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2260 IS isicol; 2261 int *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j; 2262 int *jutmp,bs = a->bs,bs2=a->bs2; 2263 int m,nzi,realloc = 0,*levtmp; 2264 int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2265 int levels,incrlev,*lev,lev_ik,shift; 2266 PetscReal f; 2267 2268 PetscFunctionBegin; 2269 if (info) { 2270 f = info->fill; 2271 levels = (int)info->levels; 2272 } else { 2273 f = 1.0; 2274 levels = 0; 2275 } 2276 PetscValidHeaderSpecific(isrow,IS_COOKIE); 2277 PetscValidHeaderSpecific(iscol,IS_COOKIE); 2278 /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/ 2279 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2280 ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr); 2281 ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr); 2282 for (k=0; k<mbs; k++) { 2283 if ( rip[k] - riip[k] != 0 ) { 2284 printf("Non-symm. permutation, use symm. permutation or general matrix format\n"); 2285 break; 2286 } 2287 } 2288 2289 /* initialization */ 2290 /* Don't know how many column pointers are needed so estimate. 2291 Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */ 2292 2293 umax = (int)(f*ai[mbs] + 1); 2294 lev = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(lev); 2295 umax += mbs + 1; 2296 shift = mbs + 1; 2297 ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju); 2298 iu[0] = mbs+1; 2299 juptr = mbs; 2300 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2301 q = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q); 2302 levtmp = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(levtmp); 2303 for (i=0; i<mbs; i++){ 2304 jl[i] = mbs; q[i] = 0; 2305 } 2306 2307 /* for each row k */ 2308 for (k=0; k<mbs; k++){ 2309 nzk = 0; 2310 q[k] = mbs; 2311 /* initialize nonzero structure of k-th row to row rip[k] of A */ 2312 jmin = ai[rip[k]]; 2313 jmax = ai[rip[k]+1]; 2314 for (j=jmin; j<jmax; j++){ 2315 vj = riip[aj[j]]; 2316 if(vj > k){ 2317 qm = k; 2318 do { 2319 m = qm; qm = q[m]; 2320 } while(qm < vj); 2321 if (qm == vj) { 2322 printf(" error: duplicate entry in A\n"); break; 2323 } 2324 nzk++; 2325 q[m] = vj; 2326 q[vj] = qm; 2327 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2328 } /* if(vj > k) */ 2329 } /* for (j=jmin; j<jmax; j++) */ 2330 2331 /* modify nonzero structure of k-th row by computing fill-in 2332 for each row i to be merged in */ 2333 i = k; 2334 i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */ 2335 /* printf(" next pivot row i=%d\n",i); */ 2336 while (i < mbs){ 2337 /* merge row i into k-th row */ 2338 j=iu[i]; 2339 vj = ju[j]; 2340 lev_ik = lev[j-shift]; 2341 nzi = iu[i+1] - (iu[i]+1); 2342 jmin = iu[i] + 1; jmax = iu[i] + nzi; 2343 qm = k; 2344 for (j=jmin; j<jmax+1; j++){ 2345 vj = ju[j]; 2346 incrlev = lev[j-shift]+lev_ik+1; 2347 2348 if (incrlev > levels) continue; 2349 do { 2350 m = qm; qm = q[m]; 2351 } while (qm < vj); 2352 if (qm != vj){ /* a fill */ 2353 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2354 levtmp[vj] = incrlev; 2355 } 2356 else { /* already a nonzero element */ 2357 if (levtmp[vj]>incrlev) levtmp[vj] = incrlev; 2358 } 2359 } 2360 i = jl[i]; /* next pivot row */ 2361 } 2362 2363 /* add k to row list for first nonzero element in k-th row */ 2364 if (nzk > 1){ 2365 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2366 jl[k] = jl[i]; jl[i] = k; 2367 } 2368 iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/ 2369 2370 /* allocate more space to ju and lev if needed */ 2371 if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax); 2372 /* estimate how much additional space we will need */ 2373 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2374 /* just double the memory each time */ 2375 maxadd = umax; 2376 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2377 umax += maxadd; 2378 2379 /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */ 2380 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 2381 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 2382 ierr = PetscFree(ju);CHKERRQ(ierr); 2383 ju = iu = jutmp; 2384 2385 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 2386 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));CHKERRQ(ierr); 2387 ierr = PetscFree(lev);CHKERRQ(ierr); 2388 lev = jutmp; 2389 2390 realloc += 2; /* count how many times we realloc */ 2391 } 2392 2393 /* save nonzero structure of k-th row in ju */ 2394 i=k; 2395 jumin = juptr + 1; juptr += nzk; 2396 for (j=jumin; j<juptr+1; j++){ 2397 i = q[i]; 2398 ju[j] = i; 2399 lev[j-shift] = levtmp[i]; 2400 /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/ 2401 } 2402 /* printf("\n"); */ 2403 } /* for (k=0; k<mbs; k++) */ 2404 2405 if (ai[mbs] != 0) { 2406 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2407 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 2408 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af); 2409 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af); 2410 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n"); 2411 } else { 2412 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 2413 } 2414 2415 ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr); 2416 ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr); 2417 2418 ierr = PetscFree(q);CHKERRQ(ierr); 2419 ierr = PetscFree(jl);CHKERRQ(ierr); 2420 2421 /* put together the new matrix */ 2422 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 2423 PLogObjectParent(*B,isicol); 2424 b = (Mat_SeqSBAIJ*)(*B)->data; 2425 ierr = PetscFree(b->imax);CHKERRQ(ierr); 2426 b->singlemalloc = PETSC_FALSE; 2427 /* the next line frees the default space generated by the Create() */ 2428 ierr = PetscFree(b->a);CHKERRQ(ierr); 2429 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 2430 b->a = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 2431 b->j = ju; 2432 b->i = iu; 2433 b->diag = 0; 2434 b->ilen = 0; 2435 b->imax = 0; 2436 b->row = isrow; 2437 b->col = iscol; 2438 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2439 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2440 b->icol = isicol; 2441 b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 2442 /* In b structure: Free imax, ilen, old a, old j. 2443 Allocate idnew, solve_work, new a, new j */ 2444 PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 2445 b->s_maxnz = b->s_nz = iu[mbs]; 2446 2447 (*B)->factor = FACTOR_LU; 2448 (*B)->info.factor_mallocs = realloc; 2449 (*B)->info.fill_ratio_given = f; 2450 if (ai[mbs] != 0) { 2451 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2452 } else { 2453 (*B)->info.fill_ratio_needed = 0.0; 2454 } 2455 2456 PetscFunctionReturn(0); 2457 } 2458 2459 2460 2461