1 /*$Id: sbaijfact2.c,v 1.2 2000/06/25 17:07:00 balay Exp hzhang $*/ 2 /* 3 Factorization code for SBAIJ format. 4 */ 5 6 #include "sbaij.h" 7 #include "src/mat/impls/baij/seq/baij.h" 8 #include "src/vec/vecimpl.h" 9 #include "src/inline/ilu.h" 10 #include "src/inline/dot.h" 11 12 #undef __FUNC__ 13 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering" 14 int MatSolveTranspose_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 15 { 16 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 17 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz; 18 int *diag = a->diag; 19 MatScalar *aa=a->a,*v; 20 Scalar s1,*x,*b; 21 22 PetscFunctionBegin; 23 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 24 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 25 26 /* forward solve the U^T */ 27 for (i=0; i<n; i++) { 28 29 v = aa + diag[i]; 30 /* multiply by the inverse of the block diagonal */ 31 s1 = (*v++)*b[i]; 32 vi = aj + diag[i] + 1; 33 nz = ai[i+1] - diag[i] - 1; 34 while (nz--) { 35 x[*vi++] -= (*v++)*s1; 36 } 37 x[i] = s1; 38 } 39 /* backward solve the L^T */ 40 for (i=n-1; i>=0; i--){ 41 v = aa + diag[i] - 1; 42 vi = aj + diag[i] - 1; 43 nz = diag[i] - ai[i]; 44 s1 = x[i]; 45 while (nz--) { 46 x[*vi--] -= (*v--)*s1; 47 } 48 } 49 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 50 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 51 PLogFlops(2*(a->nz) - a->n); 52 PetscFunctionReturn(0); 53 } 54 55 #undef __FUNC__ 56 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering" 57 int MatSolveTranspose_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 58 { 59 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 60 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 61 int *diag = a->diag,oidx; 62 MatScalar *aa=a->a,*v; 63 Scalar s1,s2,x1,x2; 64 Scalar *x,*b; 65 66 PetscFunctionBegin; 67 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 68 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 69 70 /* forward solve the U^T */ 71 idx = 0; 72 for (i=0; i<n; i++) { 73 74 v = aa + 4*diag[i]; 75 /* multiply by the inverse of the block diagonal */ 76 x1 = b[idx]; x2 = b[1+idx]; 77 s1 = v[0]*x1 + v[1]*x2; 78 s2 = v[2]*x1 + v[3]*x2; 79 v += 4; 80 81 vi = aj + diag[i] + 1; 82 nz = ai[i+1] - diag[i] - 1; 83 while (nz--) { 84 oidx = 2*(*vi++); 85 x[oidx] -= v[0]*s1 + v[1]*s2; 86 x[oidx+1] -= v[2]*s1 + v[3]*s2; 87 v += 4; 88 } 89 x[idx] = s1;x[1+idx] = s2; 90 idx += 2; 91 } 92 /* backward solve the L^T */ 93 for (i=n-1; i>=0; i--){ 94 v = aa + 4*diag[i] - 4; 95 vi = aj + diag[i] - 1; 96 nz = diag[i] - ai[i]; 97 idt = 2*i; 98 s1 = x[idt]; s2 = x[1+idt]; 99 while (nz--) { 100 idx = 2*(*vi--); 101 x[idx] -= v[0]*s1 + v[1]*s2; 102 x[idx+1] -= v[2]*s1 + v[3]*s2; 103 v -= 4; 104 } 105 } 106 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 107 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 108 PLogFlops(2*4*(a->nz) - 2*a->n); 109 PetscFunctionReturn(0); 110 } 111 112 #undef __FUNC__ 113 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering" 114 int MatSolveTranspose_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 115 { 116 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 117 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 118 int *diag = a->diag,oidx; 119 MatScalar *aa=a->a,*v; 120 Scalar s1,s2,s3,x1,x2,x3; 121 Scalar *x,*b; 122 123 PetscFunctionBegin; 124 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 125 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 126 127 /* forward solve the U^T */ 128 idx = 0; 129 for (i=0; i<n; i++) { 130 131 v = aa + 9*diag[i]; 132 /* multiply by the inverse of the block diagonal */ 133 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; 134 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 135 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 136 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 137 v += 9; 138 139 vi = aj + diag[i] + 1; 140 nz = ai[i+1] - diag[i] - 1; 141 while (nz--) { 142 oidx = 3*(*vi++); 143 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 144 x[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 145 x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 146 v += 9; 147 } 148 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3; 149 idx += 3; 150 } 151 /* backward solve the L^T */ 152 for (i=n-1; i>=0; i--){ 153 v = aa + 9*diag[i] - 9; 154 vi = aj + diag[i] - 1; 155 nz = diag[i] - ai[i]; 156 idt = 3*i; 157 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt]; 158 while (nz--) { 159 idx = 3*(*vi--); 160 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 161 x[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 162 x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 163 v -= 9; 164 } 165 } 166 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 167 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 168 PLogFlops(2*9*(a->nz) - 3*a->n); 169 PetscFunctionReturn(0); 170 } 171 172 #undef __FUNC__ 173 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering" 174 int MatSolveTranspose_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 175 { 176 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 177 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 178 int *diag = a->diag,oidx; 179 MatScalar *aa=a->a,*v; 180 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 181 Scalar *x,*b; 182 183 PetscFunctionBegin; 184 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 185 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 186 187 /* forward solve the U^T */ 188 idx = 0; 189 for (i=0; i<n; i++) { 190 191 v = aa + 16*diag[i]; 192 /* multiply by the inverse of the block diagonal */ 193 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; 194 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 195 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 196 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 197 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 198 v += 16; 199 200 vi = aj + diag[i] + 1; 201 nz = ai[i+1] - diag[i] - 1; 202 while (nz--) { 203 oidx = 4*(*vi++); 204 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 205 x[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 206 x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 207 x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 208 v += 16; 209 } 210 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; 211 idx += 4; 212 } 213 /* backward solve the L^T */ 214 for (i=n-1; i>=0; i--){ 215 v = aa + 16*diag[i] - 16; 216 vi = aj + diag[i] - 1; 217 nz = diag[i] - ai[i]; 218 idt = 4*i; 219 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; 220 while (nz--) { 221 idx = 4*(*vi--); 222 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 223 x[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 224 x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 225 x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 226 v -= 16; 227 } 228 } 229 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 230 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 231 PLogFlops(2*16*(a->nz) - 4*a->n); 232 PetscFunctionReturn(0); 233 } 234 235 #undef __FUNC__ 236 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering" 237 int MatSolveTranspose_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 238 { 239 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 240 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 241 int *diag = a->diag,oidx; 242 MatScalar *aa=a->a,*v; 243 Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 244 Scalar *x,*b; 245 246 PetscFunctionBegin; 247 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 248 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 249 250 /* forward solve the U^T */ 251 idx = 0; 252 for (i=0; i<n; i++) { 253 254 v = aa + 25*diag[i]; 255 /* multiply by the inverse of the block diagonal */ 256 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 257 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 258 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 259 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 260 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 261 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 262 v += 25; 263 264 vi = aj + diag[i] + 1; 265 nz = ai[i+1] - diag[i] - 1; 266 while (nz--) { 267 oidx = 5*(*vi++); 268 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 269 x[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 270 x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 271 x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 272 x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 273 v += 25; 274 } 275 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 276 idx += 5; 277 } 278 /* backward solve the L^T */ 279 for (i=n-1; i>=0; i--){ 280 v = aa + 25*diag[i] - 25; 281 vi = aj + diag[i] - 1; 282 nz = diag[i] - ai[i]; 283 idt = 5*i; 284 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 285 while (nz--) { 286 idx = 5*(*vi--); 287 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 288 x[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 289 x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 290 x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 291 x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 292 v -= 25; 293 } 294 } 295 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 296 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 297 PLogFlops(2*25*(a->nz) - 5*a->n); 298 PetscFunctionReturn(0); 299 } 300 301 #undef __FUNC__ 302 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering" 303 int MatSolveTranspose_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 304 { 305 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 306 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 307 int *diag = a->diag,oidx; 308 MatScalar *aa=a->a,*v; 309 Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 310 Scalar *x,*b; 311 312 PetscFunctionBegin; 313 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 314 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 315 316 /* forward solve the U^T */ 317 idx = 0; 318 for (i=0; i<n; i++) { 319 320 v = aa + 36*diag[i]; 321 /* multiply by the inverse of the block diagonal */ 322 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 323 x6 = b[5+idx]; 324 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 325 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 326 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 327 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 328 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 329 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 330 v += 36; 331 332 vi = aj + diag[i] + 1; 333 nz = ai[i+1] - diag[i] - 1; 334 while (nz--) { 335 oidx = 6*(*vi++); 336 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 337 x[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 338 x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 339 x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 340 x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 341 x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 342 v += 36; 343 } 344 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 345 x[5+idx] = s6; 346 idx += 6; 347 } 348 /* backward solve the L^T */ 349 for (i=n-1; i>=0; i--){ 350 v = aa + 36*diag[i] - 36; 351 vi = aj + diag[i] - 1; 352 nz = diag[i] - ai[i]; 353 idt = 6*i; 354 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 355 s6 = x[5+idt]; 356 while (nz--) { 357 idx = 6*(*vi--); 358 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 359 x[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 360 x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 361 x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 362 x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 363 x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 364 v -= 36; 365 } 366 } 367 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 368 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 369 PLogFlops(2*36*(a->nz) - 6*a->n); 370 PetscFunctionReturn(0); 371 } 372 373 #undef __FUNC__ 374 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering" 375 int MatSolveTranspose_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 376 { 377 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 378 int ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 379 int *diag = a->diag,oidx; 380 MatScalar *aa=a->a,*v; 381 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 382 Scalar *x,*b; 383 384 PetscFunctionBegin; 385 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 386 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 387 388 /* forward solve the U^T */ 389 idx = 0; 390 for (i=0; i<n; i++) { 391 392 v = aa + 49*diag[i]; 393 /* multiply by the inverse of the block diagonal */ 394 x1 = b[idx]; x2 = b[1+idx]; x3 = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx]; 395 x6 = b[5+idx]; x7 = b[6+idx]; 396 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 397 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 398 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 399 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 400 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 401 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 402 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 403 v += 49; 404 405 vi = aj + diag[i] + 1; 406 nz = ai[i+1] - diag[i] - 1; 407 while (nz--) { 408 oidx = 7*(*vi++); 409 x[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 410 x[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 411 x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 412 x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 413 x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 414 x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 415 x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 416 v += 49; 417 } 418 x[idx] = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5; 419 x[5+idx] = s6;x[6+idx] = s7; 420 idx += 7; 421 } 422 /* backward solve the L^T */ 423 for (i=n-1; i>=0; i--){ 424 v = aa + 49*diag[i] - 49; 425 vi = aj + diag[i] - 1; 426 nz = diag[i] - ai[i]; 427 idt = 7*i; 428 s1 = x[idt]; s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 429 s6 = x[5+idt];s7 = x[6+idt]; 430 while (nz--) { 431 idx = 7*(*vi--); 432 x[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 433 x[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 434 x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 435 x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 436 x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 437 x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 438 x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 439 v -= 49; 440 } 441 } 442 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 443 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 444 PLogFlops(2*49*(a->nz) - 7*a->n); 445 PetscFunctionReturn(0); 446 } 447 448 #undef __FUNC__ 449 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_1" 450 int MatSolveTranspose_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 451 { 452 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 453 IS iscol=a->col,isrow=a->row; 454 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 455 int *diag = a->diag; 456 MatScalar *aa=a->a,*v; 457 Scalar s1,*x,*b,*t; 458 459 PetscFunctionBegin; 460 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 461 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 462 t = a->solve_work; 463 464 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 465 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 466 467 /* copy the b into temp work space according to permutation */ 468 for (i=0; i<n; i++) { 469 t[i] = b[c[i]]; 470 } 471 472 /* forward solve the U^T */ 473 for (i=0; i<n; i++) { 474 475 v = aa + diag[i]; 476 /* multiply by the inverse of the block diagonal */ 477 s1 = (*v++)*t[i]; 478 vi = aj + diag[i] + 1; 479 nz = ai[i+1] - diag[i] - 1; 480 while (nz--) { 481 t[*vi++] -= (*v++)*s1; 482 } 483 t[i] = s1; 484 } 485 /* backward solve the L^T */ 486 for (i=n-1; i>=0; i--){ 487 v = aa + diag[i] - 1; 488 vi = aj + diag[i] - 1; 489 nz = diag[i] - ai[i]; 490 s1 = t[i]; 491 while (nz--) { 492 t[*vi--] -= (*v--)*s1; 493 } 494 } 495 496 /* copy t into x according to permutation */ 497 for (i=0; i<n; i++) { 498 x[r[i]] = t[i]; 499 } 500 501 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 502 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 503 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 504 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 505 PLogFlops(2*(a->nz) - a->n); 506 PetscFunctionReturn(0); 507 } 508 509 #undef __FUNC__ 510 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_2" 511 int MatSolveTranspose_SeqSBAIJ_2(Mat A,Vec bb,Vec xx) 512 { 513 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 514 IS iscol=a->col,isrow=a->row; 515 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 516 int *diag = a->diag,ii,ic,ir,oidx; 517 MatScalar *aa=a->a,*v; 518 Scalar s1,s2,x1,x2; 519 Scalar *x,*b,*t; 520 521 PetscFunctionBegin; 522 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 523 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 524 t = a->solve_work; 525 526 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 527 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 528 529 /* copy the b into temp work space according to permutation */ 530 ii = 0; 531 for (i=0; i<n; i++) { 532 ic = 2*c[i]; 533 t[ii] = b[ic]; 534 t[ii+1] = b[ic+1]; 535 ii += 2; 536 } 537 538 /* forward solve the U^T */ 539 idx = 0; 540 for (i=0; i<n; i++) { 541 542 v = aa + 4*diag[i]; 543 /* multiply by the inverse of the block diagonal */ 544 x1 = t[idx]; x2 = t[1+idx]; 545 s1 = v[0]*x1 + v[1]*x2; 546 s2 = v[2]*x1 + v[3]*x2; 547 v += 4; 548 549 vi = aj + diag[i] + 1; 550 nz = ai[i+1] - diag[i] - 1; 551 while (nz--) { 552 oidx = 2*(*vi++); 553 t[oidx] -= v[0]*s1 + v[1]*s2; 554 t[oidx+1] -= v[2]*s1 + v[3]*s2; 555 v += 4; 556 } 557 t[idx] = s1;t[1+idx] = s2; 558 idx += 2; 559 } 560 /* backward solve the L^T */ 561 for (i=n-1; i>=0; i--){ 562 v = aa + 4*diag[i] - 4; 563 vi = aj + diag[i] - 1; 564 nz = diag[i] - ai[i]; 565 idt = 2*i; 566 s1 = t[idt]; s2 = t[1+idt]; 567 while (nz--) { 568 idx = 2*(*vi--); 569 t[idx] -= v[0]*s1 + v[1]*s2; 570 t[idx+1] -= v[2]*s1 + v[3]*s2; 571 v -= 4; 572 } 573 } 574 575 /* copy t into x according to permutation */ 576 ii = 0; 577 for (i=0; i<n; i++) { 578 ir = 2*r[i]; 579 x[ir] = t[ii]; 580 x[ir+1] = t[ii+1]; 581 ii += 2; 582 } 583 584 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 585 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 586 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 587 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 588 PLogFlops(2*4*(a->nz) - 2*a->n); 589 PetscFunctionReturn(0); 590 } 591 592 #undef __FUNC__ 593 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_3" 594 int MatSolveTranspose_SeqSBAIJ_3(Mat A,Vec bb,Vec xx) 595 { 596 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 597 IS iscol=a->col,isrow=a->row; 598 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 599 int *diag = a->diag,ii,ic,ir,oidx; 600 MatScalar *aa=a->a,*v; 601 Scalar s1,s2,s3,x1,x2,x3; 602 Scalar *x,*b,*t; 603 604 PetscFunctionBegin; 605 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 606 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 607 t = a->solve_work; 608 609 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 610 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 611 612 /* copy the b into temp work space according to permutation */ 613 ii = 0; 614 for (i=0; i<n; i++) { 615 ic = 3*c[i]; 616 t[ii] = b[ic]; 617 t[ii+1] = b[ic+1]; 618 t[ii+2] = b[ic+2]; 619 ii += 3; 620 } 621 622 /* forward solve the U^T */ 623 idx = 0; 624 for (i=0; i<n; i++) { 625 626 v = aa + 9*diag[i]; 627 /* multiply by the inverse of the block diagonal */ 628 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 629 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3; 630 s2 = v[3]*x1 + v[4]*x2 + v[5]*x3; 631 s3 = v[6]*x1 + v[7]*x2 + v[8]*x3; 632 v += 9; 633 634 vi = aj + diag[i] + 1; 635 nz = ai[i+1] - diag[i] - 1; 636 while (nz--) { 637 oidx = 3*(*vi++); 638 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 639 t[oidx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 640 t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 641 v += 9; 642 } 643 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3; 644 idx += 3; 645 } 646 /* backward solve the L^T */ 647 for (i=n-1; i>=0; i--){ 648 v = aa + 9*diag[i] - 9; 649 vi = aj + diag[i] - 1; 650 nz = diag[i] - ai[i]; 651 idt = 3*i; 652 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 653 while (nz--) { 654 idx = 3*(*vi--); 655 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3; 656 t[idx+1] -= v[3]*s1 + v[4]*s2 + v[5]*s3; 657 t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3; 658 v -= 9; 659 } 660 } 661 662 /* copy t into x according to permutation */ 663 ii = 0; 664 for (i=0; i<n; i++) { 665 ir = 3*r[i]; 666 x[ir] = t[ii]; 667 x[ir+1] = t[ii+1]; 668 x[ir+2] = t[ii+2]; 669 ii += 3; 670 } 671 672 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 673 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 674 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 675 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 676 PLogFlops(2*9*(a->nz) - 3*a->n); 677 PetscFunctionReturn(0); 678 } 679 680 #undef __FUNC__ 681 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_4" 682 int MatSolveTranspose_SeqSBAIJ_4(Mat A,Vec bb,Vec xx) 683 { 684 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 685 IS iscol=a->col,isrow=a->row; 686 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 687 int *diag = a->diag,ii,ic,ir,oidx; 688 MatScalar *aa=a->a,*v; 689 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 690 Scalar *x,*b,*t; 691 692 PetscFunctionBegin; 693 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 694 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 695 t = a->solve_work; 696 697 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 698 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 699 700 /* copy the b into temp work space according to permutation */ 701 ii = 0; 702 for (i=0; i<n; i++) { 703 ic = 4*c[i]; 704 t[ii] = b[ic]; 705 t[ii+1] = b[ic+1]; 706 t[ii+2] = b[ic+2]; 707 t[ii+3] = b[ic+3]; 708 ii += 4; 709 } 710 711 /* forward solve the U^T */ 712 idx = 0; 713 for (i=0; i<n; i++) { 714 715 v = aa + 16*diag[i]; 716 /* multiply by the inverse of the block diagonal */ 717 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; 718 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4; 719 s2 = v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4; 720 s3 = v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4; 721 s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4; 722 v += 16; 723 724 vi = aj + diag[i] + 1; 725 nz = ai[i+1] - diag[i] - 1; 726 while (nz--) { 727 oidx = 4*(*vi++); 728 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 729 t[oidx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 730 t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 731 t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 732 v += 16; 733 } 734 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; 735 idx += 4; 736 } 737 /* backward solve the L^T */ 738 for (i=n-1; i>=0; i--){ 739 v = aa + 16*diag[i] - 16; 740 vi = aj + diag[i] - 1; 741 nz = diag[i] - ai[i]; 742 idt = 4*i; 743 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; 744 while (nz--) { 745 idx = 4*(*vi--); 746 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4; 747 t[idx+1] -= v[4]*s1 + v[5]*s2 + v[6]*s3 + v[7]*s4; 748 t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4; 749 t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4; 750 v -= 16; 751 } 752 } 753 754 /* copy t into x according to permutation */ 755 ii = 0; 756 for (i=0; i<n; i++) { 757 ir = 4*r[i]; 758 x[ir] = t[ii]; 759 x[ir+1] = t[ii+1]; 760 x[ir+2] = t[ii+2]; 761 x[ir+3] = t[ii+3]; 762 ii += 4; 763 } 764 765 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 766 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 767 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 768 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 769 PLogFlops(2*16*(a->nz) - 4*a->n); 770 PetscFunctionReturn(0); 771 } 772 773 #undef __FUNC__ 774 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_5" 775 int MatSolveTranspose_SeqSBAIJ_5(Mat A,Vec bb,Vec xx) 776 { 777 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 778 IS iscol=a->col,isrow=a->row; 779 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 780 int *diag = a->diag,ii,ic,ir,oidx; 781 MatScalar *aa=a->a,*v; 782 Scalar s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 783 Scalar *x,*b,*t; 784 785 PetscFunctionBegin; 786 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 787 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 788 t = a->solve_work; 789 790 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 791 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 792 793 /* copy the b into temp work space according to permutation */ 794 ii = 0; 795 for (i=0; i<n; i++) { 796 ic = 5*c[i]; 797 t[ii] = b[ic]; 798 t[ii+1] = b[ic+1]; 799 t[ii+2] = b[ic+2]; 800 t[ii+3] = b[ic+3]; 801 t[ii+4] = b[ic+4]; 802 ii += 5; 803 } 804 805 /* forward solve the U^T */ 806 idx = 0; 807 for (i=0; i<n; i++) { 808 809 v = aa + 25*diag[i]; 810 /* multiply by the inverse of the block diagonal */ 811 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 812 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5; 813 s2 = v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5; 814 s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5; 815 s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5; 816 s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5; 817 v += 25; 818 819 vi = aj + diag[i] + 1; 820 nz = ai[i+1] - diag[i] - 1; 821 while (nz--) { 822 oidx = 5*(*vi++); 823 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 824 t[oidx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 825 t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 826 t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 827 t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 828 v += 25; 829 } 830 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 831 idx += 5; 832 } 833 /* backward solve the L^T */ 834 for (i=n-1; i>=0; i--){ 835 v = aa + 25*diag[i] - 25; 836 vi = aj + diag[i] - 1; 837 nz = diag[i] - ai[i]; 838 idt = 5*i; 839 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 840 while (nz--) { 841 idx = 5*(*vi--); 842 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5; 843 t[idx+1] -= v[5]*s1 + v[6]*s2 + v[7]*s3 + v[8]*s4 + v[9]*s5; 844 t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5; 845 t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5; 846 t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5; 847 v -= 25; 848 } 849 } 850 851 /* copy t into x according to permutation */ 852 ii = 0; 853 for (i=0; i<n; i++) { 854 ir = 5*r[i]; 855 x[ir] = t[ii]; 856 x[ir+1] = t[ii+1]; 857 x[ir+2] = t[ii+2]; 858 x[ir+3] = t[ii+3]; 859 x[ir+4] = t[ii+4]; 860 ii += 5; 861 } 862 863 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 864 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 865 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 866 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 867 PLogFlops(2*25*(a->nz) - 5*a->n); 868 PetscFunctionReturn(0); 869 } 870 871 #undef __FUNC__ 872 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_6" 873 int MatSolveTranspose_SeqSBAIJ_6(Mat A,Vec bb,Vec xx) 874 { 875 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 876 IS iscol=a->col,isrow=a->row; 877 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 878 int *diag = a->diag,ii,ic,ir,oidx; 879 MatScalar *aa=a->a,*v; 880 Scalar s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 881 Scalar *x,*b,*t; 882 883 PetscFunctionBegin; 884 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 885 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 886 t = a->solve_work; 887 888 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 889 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 890 891 /* copy the b into temp work space according to permutation */ 892 ii = 0; 893 for (i=0; i<n; i++) { 894 ic = 6*c[i]; 895 t[ii] = b[ic]; 896 t[ii+1] = b[ic+1]; 897 t[ii+2] = b[ic+2]; 898 t[ii+3] = b[ic+3]; 899 t[ii+4] = b[ic+4]; 900 t[ii+5] = b[ic+5]; 901 ii += 6; 902 } 903 904 /* forward solve the U^T */ 905 idx = 0; 906 for (i=0; i<n; i++) { 907 908 v = aa + 36*diag[i]; 909 /* multiply by the inverse of the block diagonal */ 910 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 911 x6 = t[5+idx]; 912 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6; 913 s2 = v[6]*x1 + v[7]*x2 + v[8]*x3 + v[9]*x4 + v[10]*x5 + v[11]*x6; 914 s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6; 915 s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6; 916 s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6; 917 s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6; 918 v += 36; 919 920 vi = aj + diag[i] + 1; 921 nz = ai[i+1] - diag[i] - 1; 922 while (nz--) { 923 oidx = 6*(*vi++); 924 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 925 t[oidx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 926 t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 927 t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 928 t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 929 t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 930 v += 36; 931 } 932 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 933 t[5+idx] = s6; 934 idx += 6; 935 } 936 /* backward solve the L^T */ 937 for (i=n-1; i>=0; i--){ 938 v = aa + 36*diag[i] - 36; 939 vi = aj + diag[i] - 1; 940 nz = diag[i] - ai[i]; 941 idt = 6*i; 942 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 943 s6 = t[5+idt]; 944 while (nz--) { 945 idx = 6*(*vi--); 946 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6; 947 t[idx+1] -= v[6]*s1 + v[7]*s2 + v[8]*s3 + v[9]*s4 + v[10]*s5 + v[11]*s6; 948 t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6; 949 t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6; 950 t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6; 951 t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6; 952 v -= 36; 953 } 954 } 955 956 /* copy t into x according to permutation */ 957 ii = 0; 958 for (i=0; i<n; i++) { 959 ir = 6*r[i]; 960 x[ir] = t[ii]; 961 x[ir+1] = t[ii+1]; 962 x[ir+2] = t[ii+2]; 963 x[ir+3] = t[ii+3]; 964 x[ir+4] = t[ii+4]; 965 x[ir+5] = t[ii+5]; 966 ii += 6; 967 } 968 969 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 970 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 971 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 972 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 973 PLogFlops(2*36*(a->nz) - 6*a->n); 974 PetscFunctionReturn(0); 975 } 976 977 #undef __FUNC__ 978 #define __FUNC__ "MatSolveTranspose_SeqSBAIJ_7" 979 int MatSolveTranspose_SeqSBAIJ_7(Mat A,Vec bb,Vec xx) 980 { 981 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 982 IS iscol=a->col,isrow=a->row; 983 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout; 984 int *diag = a->diag,ii,ic,ir,oidx; 985 MatScalar *aa=a->a,*v; 986 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 987 Scalar *x,*b,*t; 988 989 PetscFunctionBegin; 990 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 991 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 992 t = a->solve_work; 993 994 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 995 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout; 996 997 /* copy the b into temp work space according to permutation */ 998 ii = 0; 999 for (i=0; i<n; i++) { 1000 ic = 7*c[i]; 1001 t[ii] = b[ic]; 1002 t[ii+1] = b[ic+1]; 1003 t[ii+2] = b[ic+2]; 1004 t[ii+3] = b[ic+3]; 1005 t[ii+4] = b[ic+4]; 1006 t[ii+5] = b[ic+5]; 1007 t[ii+6] = b[ic+6]; 1008 ii += 7; 1009 } 1010 1011 /* forward solve the U^T */ 1012 idx = 0; 1013 for (i=0; i<n; i++) { 1014 1015 v = aa + 49*diag[i]; 1016 /* multiply by the inverse of the block diagonal */ 1017 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1018 x6 = t[5+idx]; x7 = t[6+idx]; 1019 s1 = v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5 + v[5]*x6 + v[6]*x7; 1020 s2 = v[7]*x1 + v[8]*x2 + v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7; 1021 s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7; 1022 s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7; 1023 s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7; 1024 s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7; 1025 s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7; 1026 v += 49; 1027 1028 vi = aj + diag[i] + 1; 1029 nz = ai[i+1] - diag[i] - 1; 1030 while (nz--) { 1031 oidx = 7*(*vi++); 1032 t[oidx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1033 t[oidx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1034 t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1035 t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1036 t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1037 t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1038 t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1039 v += 49; 1040 } 1041 t[idx] = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1042 t[5+idx] = s6;t[6+idx] = s7; 1043 idx += 7; 1044 } 1045 /* backward solve the L^T */ 1046 for (i=n-1; i>=0; i--){ 1047 v = aa + 49*diag[i] - 49; 1048 vi = aj + diag[i] - 1; 1049 nz = diag[i] - ai[i]; 1050 idt = 7*i; 1051 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1052 s6 = t[5+idt];s7 = t[6+idt]; 1053 while (nz--) { 1054 idx = 7*(*vi--); 1055 t[idx] -= v[0]*s1 + v[1]*s2 + v[2]*s3 + v[3]*s4 + v[4]*s5 + v[5]*s6 + v[6]*s7; 1056 t[idx+1] -= v[7]*s1 + v[8]*s2 + v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7; 1057 t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7; 1058 t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7; 1059 t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7; 1060 t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7; 1061 t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7; 1062 v -= 49; 1063 } 1064 } 1065 1066 /* copy t into x according to permutation */ 1067 ii = 0; 1068 for (i=0; i<n; i++) { 1069 ir = 7*r[i]; 1070 x[ir] = t[ii]; 1071 x[ir+1] = t[ii+1]; 1072 x[ir+2] = t[ii+2]; 1073 x[ir+3] = t[ii+3]; 1074 x[ir+4] = t[ii+4]; 1075 x[ir+5] = t[ii+5]; 1076 x[ir+6] = t[ii+6]; 1077 ii += 7; 1078 } 1079 1080 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1081 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1082 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1083 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1084 PLogFlops(2*49*(a->nz) - 7*a->n); 1085 PetscFunctionReturn(0); 1086 } 1087 1088 /* ----------------------------------------------------------- */ 1089 #undef __FUNC__ 1090 #define __FUNC__ "MatSolve_SeqSBAIJ_N" 1091 int MatSolve_SeqSBAIJ_N(Mat A,Vec bb,Vec xx) 1092 { 1093 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1094 IS iscol=a->col,isrow=a->row; 1095 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j; 1096 int nz,bs=a->bs,bs2=a->bs2,*rout,*cout; 1097 MatScalar *aa=a->a,*v; 1098 Scalar *x,*b,*s,*t,*ls; 1099 1100 PetscFunctionBegin; 1101 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1102 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1103 t = a->solve_work; 1104 1105 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1106 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1107 1108 /* forward solve the lower triangular */ 1109 ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 1110 for (i=1; i<n; i++) { 1111 v = aa + bs2*ai[i]; 1112 vi = aj + ai[i]; 1113 nz = a->diag[i] - ai[i]; 1114 s = t + bs*i; 1115 ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr); 1116 while (nz--) { 1117 Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++)); 1118 v += bs2; 1119 } 1120 } 1121 /* backward solve the upper triangular */ 1122 ls = a->solve_work + a->n; 1123 for (i=n-1; i>=0; i--){ 1124 v = aa + bs2*(a->diag[i] + 1); 1125 vi = aj + a->diag[i] + 1; 1126 nz = ai[i+1] - a->diag[i] - 1; 1127 ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 1128 while (nz--) { 1129 Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++)); 1130 v += bs2; 1131 } 1132 Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs); 1133 ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr); 1134 } 1135 1136 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1137 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1138 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1139 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1140 PLogFlops(2*(a->bs2)*(a->nz) - a->bs*a->n); 1141 PetscFunctionReturn(0); 1142 } 1143 1144 #undef __FUNC__ 1145 #define __FUNC__ "MatSolve_SeqSBAIJ_7" 1146 int MatSolve_SeqSBAIJ_7(Mat A,Vec bb,Vec xx) 1147 { 1148 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1149 IS iscol=a->col,isrow=a->row; 1150 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1151 int *diag = a->diag; 1152 MatScalar *aa=a->a,*v; 1153 Scalar s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1154 Scalar *x,*b,*t; 1155 1156 PetscFunctionBegin; 1157 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1158 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1159 t = a->solve_work; 1160 1161 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1162 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1163 1164 /* forward solve the lower triangular */ 1165 idx = 7*(*r++); 1166 t[0] = b[idx]; t[1] = b[1+idx]; 1167 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1168 t[5] = b[5+idx]; t[6] = b[6+idx]; 1169 1170 for (i=1; i<n; i++) { 1171 v = aa + 49*ai[i]; 1172 vi = aj + ai[i]; 1173 nz = diag[i] - ai[i]; 1174 idx = 7*(*r++); 1175 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1176 s5 = b[4+idx];s6 = b[5+idx];s7 = b[6+idx]; 1177 while (nz--) { 1178 idx = 7*(*vi++); 1179 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1180 x4 = t[3+idx];x5 = t[4+idx]; 1181 x6 = t[5+idx];x7 = t[6+idx]; 1182 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1183 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1184 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1185 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1186 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1187 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1188 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1189 v += 49; 1190 } 1191 idx = 7*i; 1192 t[idx] = s1;t[1+idx] = s2; 1193 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1194 t[5+idx] = s6;t[6+idx] = s7; 1195 } 1196 /* backward solve the upper triangular */ 1197 for (i=n-1; i>=0; i--){ 1198 v = aa + 49*diag[i] + 49; 1199 vi = aj + diag[i] + 1; 1200 nz = ai[i+1] - diag[i] - 1; 1201 idt = 7*i; 1202 s1 = t[idt]; s2 = t[1+idt]; 1203 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1204 s6 = t[5+idt];s7 = t[6+idt]; 1205 while (nz--) { 1206 idx = 7*(*vi++); 1207 x1 = t[idx]; x2 = t[1+idx]; 1208 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1209 x6 = t[5+idx]; x7 = t[6+idx]; 1210 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1211 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1212 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1213 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1214 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1215 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1216 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1217 v += 49; 1218 } 1219 idc = 7*(*c--); 1220 v = aa + 49*diag[i]; 1221 x[idc] = t[idt] = v[0]*s1+v[7]*s2+v[14]*s3+ 1222 v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7; 1223 x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+ 1224 v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7; 1225 x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+ 1226 v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7; 1227 x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+ 1228 v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7; 1229 x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+ 1230 v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7; 1231 x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+ 1232 v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7; 1233 x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+ 1234 v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7; 1235 } 1236 1237 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1238 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1239 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1240 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1241 PLogFlops(2*49*(a->nz) - 7*a->n); 1242 PetscFunctionReturn(0); 1243 } 1244 1245 #undef __FUNC__ 1246 #define __FUNC__ "MatSolve_SeqSBAIJ_7_NaturalOrdering" 1247 int MatSolve_SeqSBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx) 1248 { 1249 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1250 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1251 int ierr,*diag = a->diag,jdx; 1252 MatScalar *aa=a->a,*v; 1253 Scalar *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7; 1254 1255 PetscFunctionBegin; 1256 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1257 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1258 /* forward solve the lower triangular */ 1259 idx = 0; 1260 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1261 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1262 x[6] = b[6+idx]; 1263 for (i=1; i<n; i++) { 1264 v = aa + 49*ai[i]; 1265 vi = aj + ai[i]; 1266 nz = diag[i] - ai[i]; 1267 idx = 7*i; 1268 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1269 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1270 s7 = b[6+idx]; 1271 while (nz--) { 1272 jdx = 7*(*vi++); 1273 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1274 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1275 x7 = x[6+jdx]; 1276 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1277 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1278 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1279 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1280 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1281 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1282 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1283 v += 49; 1284 } 1285 x[idx] = s1; 1286 x[1+idx] = s2; 1287 x[2+idx] = s3; 1288 x[3+idx] = s4; 1289 x[4+idx] = s5; 1290 x[5+idx] = s6; 1291 x[6+idx] = s7; 1292 } 1293 /* backward solve the upper triangular */ 1294 for (i=n-1; i>=0; i--){ 1295 v = aa + 49*diag[i] + 49; 1296 vi = aj + diag[i] + 1; 1297 nz = ai[i+1] - diag[i] - 1; 1298 idt = 7*i; 1299 s1 = x[idt]; s2 = x[1+idt]; 1300 s3 = x[2+idt]; s4 = x[3+idt]; 1301 s5 = x[4+idt]; s6 = x[5+idt]; 1302 s7 = x[6+idt]; 1303 while (nz--) { 1304 idx = 7*(*vi++); 1305 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1306 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1307 x7 = x[6+idx]; 1308 s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7; 1309 s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7; 1310 s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7; 1311 s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7; 1312 s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7; 1313 s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7; 1314 s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7; 1315 v += 49; 1316 } 1317 v = aa + 49*diag[i]; 1318 x[idt] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4 1319 + v[28]*s5 + v[35]*s6 + v[42]*s7; 1320 x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4 1321 + v[29]*s5 + v[36]*s6 + v[43]*s7; 1322 x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4 1323 + v[30]*s5 + v[37]*s6 + v[44]*s7; 1324 x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4 1325 + v[31]*s5 + v[38]*s6 + v[45]*s7; 1326 x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4 1327 + v[32]*s5 + v[39]*s6 + v[46]*s7; 1328 x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4 1329 + v[33]*s5 + v[40]*s6 + v[47]*s7; 1330 x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4 1331 + v[34]*s5 + v[41]*s6 + v[48]*s7; 1332 } 1333 1334 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1335 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1336 PLogFlops(2*36*(a->nz) - 6*a->n); 1337 PetscFunctionReturn(0); 1338 } 1339 1340 #undef __FUNC__ 1341 #define __FUNC__ "MatSolve_SeqSBAIJ_6" 1342 int MatSolve_SeqSBAIJ_6(Mat A,Vec bb,Vec xx) 1343 { 1344 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1345 IS iscol=a->col,isrow=a->row; 1346 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1347 int *diag = a->diag; 1348 MatScalar *aa=a->a,*v; 1349 Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t; 1350 1351 PetscFunctionBegin; 1352 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1353 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1354 t = a->solve_work; 1355 1356 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1357 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1358 1359 /* forward solve the lower triangular */ 1360 idx = 6*(*r++); 1361 t[0] = b[idx]; t[1] = b[1+idx]; 1362 t[2] = b[2+idx]; t[3] = b[3+idx]; 1363 t[4] = b[4+idx]; t[5] = b[5+idx]; 1364 for (i=1; i<n; i++) { 1365 v = aa + 36*ai[i]; 1366 vi = aj + ai[i]; 1367 nz = diag[i] - ai[i]; 1368 idx = 6*(*r++); 1369 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1370 s5 = b[4+idx]; s6 = b[5+idx]; 1371 while (nz--) { 1372 idx = 6*(*vi++); 1373 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1374 x4 = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx]; 1375 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1376 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1377 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1378 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1379 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1380 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1381 v += 36; 1382 } 1383 idx = 6*i; 1384 t[idx] = s1;t[1+idx] = s2; 1385 t[2+idx] = s3;t[3+idx] = s4; 1386 t[4+idx] = s5;t[5+idx] = s6; 1387 } 1388 /* backward solve the upper triangular */ 1389 for (i=n-1; i>=0; i--){ 1390 v = aa + 36*diag[i] + 36; 1391 vi = aj + diag[i] + 1; 1392 nz = ai[i+1] - diag[i] - 1; 1393 idt = 6*i; 1394 s1 = t[idt]; s2 = t[1+idt]; 1395 s3 = t[2+idt];s4 = t[3+idt]; 1396 s5 = t[4+idt];s6 = t[5+idt]; 1397 while (nz--) { 1398 idx = 6*(*vi++); 1399 x1 = t[idx]; x2 = t[1+idx]; 1400 x3 = t[2+idx]; x4 = t[3+idx]; 1401 x5 = t[4+idx]; x6 = t[5+idx]; 1402 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1403 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1404 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1405 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1406 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1407 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1408 v += 36; 1409 } 1410 idc = 6*(*c--); 1411 v = aa + 36*diag[i]; 1412 x[idc] = t[idt] = v[0]*s1+v[6]*s2+v[12]*s3+ 1413 v[18]*s4+v[24]*s5+v[30]*s6; 1414 x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+ 1415 v[19]*s4+v[25]*s5+v[31]*s6; 1416 x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+ 1417 v[20]*s4+v[26]*s5+v[32]*s6; 1418 x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+ 1419 v[21]*s4+v[27]*s5+v[33]*s6; 1420 x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+ 1421 v[22]*s4+v[28]*s5+v[34]*s6; 1422 x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+ 1423 v[23]*s4+v[29]*s5+v[35]*s6; 1424 } 1425 1426 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1427 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1428 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1429 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1430 PLogFlops(2*36*(a->nz) - 6*a->n); 1431 PetscFunctionReturn(0); 1432 } 1433 1434 #undef __FUNC__ 1435 #define __FUNC__ "MatSolve_SeqSBAIJ_6_NaturalOrdering" 1436 int MatSolve_SeqSBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx) 1437 { 1438 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1439 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1440 int ierr,*diag = a->diag,jdx; 1441 MatScalar *aa=a->a,*v; 1442 Scalar *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6; 1443 1444 PetscFunctionBegin; 1445 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1446 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1447 /* forward solve the lower triangular */ 1448 idx = 0; 1449 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; 1450 x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx]; 1451 for (i=1; i<n; i++) { 1452 v = aa + 36*ai[i]; 1453 vi = aj + ai[i]; 1454 nz = diag[i] - ai[i]; 1455 idx = 6*i; 1456 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1457 s4 = b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx]; 1458 while (nz--) { 1459 jdx = 6*(*vi++); 1460 x1 = x[jdx]; x2 = x[1+jdx]; x3 = x[2+jdx]; 1461 x4 = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx]; 1462 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1463 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1464 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1465 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1466 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1467 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1468 v += 36; 1469 } 1470 x[idx] = s1; 1471 x[1+idx] = s2; 1472 x[2+idx] = s3; 1473 x[3+idx] = s4; 1474 x[4+idx] = s5; 1475 x[5+idx] = s6; 1476 } 1477 /* backward solve the upper triangular */ 1478 for (i=n-1; i>=0; i--){ 1479 v = aa + 36*diag[i] + 36; 1480 vi = aj + diag[i] + 1; 1481 nz = ai[i+1] - diag[i] - 1; 1482 idt = 6*i; 1483 s1 = x[idt]; s2 = x[1+idt]; 1484 s3 = x[2+idt]; s4 = x[3+idt]; 1485 s5 = x[4+idt]; s6 = x[5+idt]; 1486 while (nz--) { 1487 idx = 6*(*vi++); 1488 x1 = x[idx]; x2 = x[1+idx]; x3 = x[2+idx]; 1489 x4 = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx]; 1490 s1 -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6; 1491 s2 -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6; 1492 s3 -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6; 1493 s4 -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6; 1494 s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6; 1495 s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6; 1496 v += 36; 1497 } 1498 v = aa + 36*diag[i]; 1499 x[idt] = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6; 1500 x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6; 1501 x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6; 1502 x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6; 1503 x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6; 1504 x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6; 1505 } 1506 1507 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1508 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1509 PLogFlops(2*36*(a->nz) - 6*a->n); 1510 PetscFunctionReturn(0); 1511 } 1512 1513 #undef __FUNC__ 1514 #define __FUNC__ "MatSolve_SeqSBAIJ_5" 1515 int MatSolve_SeqSBAIJ_5(Mat A,Vec bb,Vec xx) 1516 { 1517 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1518 IS iscol=a->col,isrow=a->row; 1519 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1520 int *diag = a->diag; 1521 MatScalar *aa=a->a,*v; 1522 Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t; 1523 1524 PetscFunctionBegin; 1525 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1526 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1527 t = a->solve_work; 1528 1529 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1530 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1531 1532 /* forward solve the lower triangular */ 1533 idx = 5*(*r++); 1534 t[0] = b[idx]; t[1] = b[1+idx]; 1535 t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx]; 1536 for (i=1; i<n; i++) { 1537 v = aa + 25*ai[i]; 1538 vi = aj + ai[i]; 1539 nz = diag[i] - ai[i]; 1540 idx = 5*(*r++); 1541 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1542 s5 = b[4+idx]; 1543 while (nz--) { 1544 idx = 5*(*vi++); 1545 x1 = t[idx]; x2 = t[1+idx];x3 = t[2+idx]; 1546 x4 = t[3+idx];x5 = t[4+idx]; 1547 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1548 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1549 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1550 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1551 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1552 v += 25; 1553 } 1554 idx = 5*i; 1555 t[idx] = s1;t[1+idx] = s2; 1556 t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; 1557 } 1558 /* backward solve the upper triangular */ 1559 for (i=n-1; i>=0; i--){ 1560 v = aa + 25*diag[i] + 25; 1561 vi = aj + diag[i] + 1; 1562 nz = ai[i+1] - diag[i] - 1; 1563 idt = 5*i; 1564 s1 = t[idt]; s2 = t[1+idt]; 1565 s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt]; 1566 while (nz--) { 1567 idx = 5*(*vi++); 1568 x1 = t[idx]; x2 = t[1+idx]; 1569 x3 = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx]; 1570 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1571 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1572 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1573 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1574 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1575 v += 25; 1576 } 1577 idc = 5*(*c--); 1578 v = aa + 25*diag[i]; 1579 x[idc] = t[idt] = v[0]*s1+v[5]*s2+v[10]*s3+ 1580 v[15]*s4+v[20]*s5; 1581 x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+ 1582 v[16]*s4+v[21]*s5; 1583 x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+ 1584 v[17]*s4+v[22]*s5; 1585 x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+ 1586 v[18]*s4+v[23]*s5; 1587 x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+ 1588 v[19]*s4+v[24]*s5; 1589 } 1590 1591 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1592 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1593 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1594 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1595 PLogFlops(2*25*(a->nz) - 5*a->n); 1596 PetscFunctionReturn(0); 1597 } 1598 1599 #undef __FUNC__ 1600 #define __FUNC__ "MatSolve_SeqSBAIJ_5_NaturalOrdering" 1601 int MatSolve_SeqSBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx) 1602 { 1603 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1604 int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt; 1605 int ierr,*diag = a->diag,jdx; 1606 MatScalar *aa=a->a,*v; 1607 Scalar *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5; 1608 1609 PetscFunctionBegin; 1610 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1611 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1612 /* forward solve the lower triangular */ 1613 idx = 0; 1614 x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx]; 1615 for (i=1; i<n; i++) { 1616 v = aa + 25*ai[i]; 1617 vi = aj + ai[i]; 1618 nz = diag[i] - ai[i]; 1619 idx = 5*i; 1620 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx]; 1621 while (nz--) { 1622 jdx = 5*(*vi++); 1623 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx]; 1624 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1625 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1626 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1627 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1628 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1629 v += 25; 1630 } 1631 x[idx] = s1; 1632 x[1+idx] = s2; 1633 x[2+idx] = s3; 1634 x[3+idx] = s4; 1635 x[4+idx] = s5; 1636 } 1637 /* backward solve the upper triangular */ 1638 for (i=n-1; i>=0; i--){ 1639 v = aa + 25*diag[i] + 25; 1640 vi = aj + diag[i] + 1; 1641 nz = ai[i+1] - diag[i] - 1; 1642 idt = 5*i; 1643 s1 = x[idt]; s2 = x[1+idt]; 1644 s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt]; 1645 while (nz--) { 1646 idx = 5*(*vi++); 1647 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx]; 1648 s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5; 1649 s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5; 1650 s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5; 1651 s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5; 1652 s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5; 1653 v += 25; 1654 } 1655 v = aa + 25*diag[i]; 1656 x[idt] = v[0]*s1 + v[5]*s2 + v[10]*s3 + v[15]*s4 + v[20]*s5; 1657 x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3 + v[16]*s4 + v[21]*s5; 1658 x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3 + v[17]*s4 + v[22]*s5; 1659 x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3 + v[18]*s4 + v[23]*s5; 1660 x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3 + v[19]*s4 + v[24]*s5; 1661 } 1662 1663 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1664 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1665 PLogFlops(2*25*(a->nz) - 5*a->n); 1666 PetscFunctionReturn(0); 1667 } 1668 1669 #undef __FUNC__ 1670 #define __FUNC__ "MatSolve_SeqSBAIJ_4" 1671 int MatSolve_SeqSBAIJ_4(Mat A,Vec bb,Vec xx) 1672 { 1673 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1674 IS iscol=a->col,isrow=a->row; 1675 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1676 int *diag = a->diag; 1677 MatScalar *aa=a->a,*v; 1678 Scalar *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t; 1679 1680 PetscFunctionBegin; 1681 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1682 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1683 t = a->solve_work; 1684 1685 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1686 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1687 1688 /* forward solve the lower triangular */ 1689 idx = 4*(*r++); 1690 t[0] = b[idx]; t[1] = b[1+idx]; 1691 t[2] = b[2+idx]; t[3] = b[3+idx]; 1692 for (i=1; i<n; i++) { 1693 v = aa + 16*ai[i]; 1694 vi = aj + ai[i]; 1695 nz = diag[i] - ai[i]; 1696 idx = 4*(*r++); 1697 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1698 while (nz--) { 1699 idx = 4*(*vi++); 1700 x1 = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx]; 1701 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1702 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1703 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1704 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1705 v += 16; 1706 } 1707 idx = 4*i; 1708 t[idx] = s1;t[1+idx] = s2; 1709 t[2+idx] = s3;t[3+idx] = s4; 1710 } 1711 /* backward solve the upper triangular */ 1712 for (i=n-1; i>=0; i--){ 1713 v = aa + 16*diag[i] + 16; 1714 vi = aj + diag[i] + 1; 1715 nz = ai[i+1] - diag[i] - 1; 1716 idt = 4*i; 1717 s1 = t[idt]; s2 = t[1+idt]; 1718 s3 = t[2+idt];s4 = t[3+idt]; 1719 while (nz--) { 1720 idx = 4*(*vi++); 1721 x1 = t[idx]; x2 = t[1+idx]; 1722 x3 = t[2+idx]; x4 = t[3+idx]; 1723 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1724 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1725 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1726 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1727 v += 16; 1728 } 1729 idc = 4*(*c--); 1730 v = aa + 16*diag[i]; 1731 x[idc] = t[idt] = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4; 1732 x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4; 1733 x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4; 1734 x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4; 1735 } 1736 1737 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1738 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1739 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1740 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1741 PLogFlops(2*16*(a->nz) - 4*a->n); 1742 PetscFunctionReturn(0); 1743 } 1744 1745 1746 /* 1747 Special case where the matrix was ILU(0) factored in the natural 1748 ordering. This eliminates the need for the column and row permutation. 1749 */ 1750 #undef __FUNC__ 1751 #define __FUNC__ "MatSolve_SeqSBAIJ_4_NaturalOrdering" 1752 int MatSolve_SeqSBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx) 1753 { 1754 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1755 int n=a->mbs,*ai=a->i,*aj=a->j; 1756 int ierr,*diag = a->diag; 1757 MatScalar *aa=a->a; 1758 Scalar *x,*b; 1759 1760 PetscFunctionBegin; 1761 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1762 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1763 1764 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS) 1765 { 1766 static Scalar w[2000]; /* very BAD need to fix */ 1767 fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w); 1768 } 1769 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ) 1770 { 1771 static Scalar w[2000]; /* very BAD need to fix */ 1772 fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w); 1773 } 1774 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL) 1775 fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b); 1776 #else 1777 { 1778 Scalar s1,s2,s3,s4,x1,x2,x3,x4; 1779 MatScalar *v; 1780 int jdx,idt,idx,nz,*vi,i,ai16; 1781 1782 /* forward solve the lower triangular */ 1783 idx = 0; 1784 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3]; 1785 for (i=1; i<n; i++) { 1786 v = aa + 16*ai[i]; 1787 vi = aj + ai[i]; 1788 nz = diag[i] - ai[i]; 1789 idx += 4; 1790 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx]; 1791 while (nz--) { 1792 jdx = 4*(*vi++); 1793 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx]; 1794 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1795 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1796 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1797 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1798 v += 16; 1799 } 1800 x[idx] = s1; 1801 x[1+idx] = s2; 1802 x[2+idx] = s3; 1803 x[3+idx] = s4; 1804 } 1805 /* backward solve the upper triangular */ 1806 idt = 4*(n-1); 1807 for (i=n-1; i>=0; i--){ 1808 ai16 = 16*diag[i]; 1809 v = aa + ai16 + 16; 1810 vi = aj + diag[i] + 1; 1811 nz = ai[i+1] - diag[i] - 1; 1812 s1 = x[idt]; s2 = x[1+idt]; 1813 s3 = x[2+idt];s4 = x[3+idt]; 1814 while (nz--) { 1815 idx = 4*(*vi++); 1816 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; x4 = x[3+idx]; 1817 s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4; 1818 s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4; 1819 s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4; 1820 s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4; 1821 v += 16; 1822 } 1823 v = aa + ai16; 1824 x[idt] = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4; 1825 x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4; 1826 x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4; 1827 x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4; 1828 idt -= 4; 1829 } 1830 } 1831 #endif 1832 1833 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1834 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1835 PLogFlops(2*16*(a->nz) - 4*a->n); 1836 PetscFunctionReturn(0); 1837 } 1838 1839 #undef __FUNC__ 1840 #define __FUNC__ "MatSolve_SeqSBAIJ_3" 1841 int MatSolve_SeqSBAIJ_3(Mat A,Vec bb,Vec xx) 1842 { 1843 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1844 IS iscol=a->col,isrow=a->row; 1845 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1846 int *diag = a->diag; 1847 MatScalar *aa=a->a,*v; 1848 Scalar *x,*b,s1,s2,s3,x1,x2,x3,*t; 1849 1850 PetscFunctionBegin; 1851 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1852 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1853 t = a->solve_work; 1854 1855 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1856 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1857 1858 /* forward solve the lower triangular */ 1859 idx = 3*(*r++); 1860 t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx]; 1861 for (i=1; i<n; i++) { 1862 v = aa + 9*ai[i]; 1863 vi = aj + ai[i]; 1864 nz = diag[i] - ai[i]; 1865 idx = 3*(*r++); 1866 s1 = b[idx]; s2 = b[1+idx]; s3 = b[2+idx]; 1867 while (nz--) { 1868 idx = 3*(*vi++); 1869 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1870 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1871 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1872 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1873 v += 9; 1874 } 1875 idx = 3*i; 1876 t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3; 1877 } 1878 /* backward solve the upper triangular */ 1879 for (i=n-1; i>=0; i--){ 1880 v = aa + 9*diag[i] + 9; 1881 vi = aj + diag[i] + 1; 1882 nz = ai[i+1] - diag[i] - 1; 1883 idt = 3*i; 1884 s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt]; 1885 while (nz--) { 1886 idx = 3*(*vi++); 1887 x1 = t[idx]; x2 = t[1+idx]; x3 = t[2+idx]; 1888 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1889 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1890 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1891 v += 9; 1892 } 1893 idc = 3*(*c--); 1894 v = aa + 9*diag[i]; 1895 x[idc] = t[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1896 x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1897 x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1898 } 1899 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 1900 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 1901 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1902 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1903 PLogFlops(2*9*(a->nz) - 3*a->n); 1904 PetscFunctionReturn(0); 1905 } 1906 1907 /* 1908 Special case where the matrix was ILU(0) factored in the natural 1909 ordering. This eliminates the need for the column and row permutation. 1910 */ 1911 #undef __FUNC__ 1912 #define __FUNC__ "MatSolve_SeqSBAIJ_3_NaturalOrdering" 1913 int MatSolve_SeqSBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx) 1914 { 1915 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 1916 int n=a->mbs,*ai=a->i,*aj=a->j; 1917 int ierr,*diag = a->diag; 1918 MatScalar *aa=a->a,*v; 1919 Scalar *x,*b,s1,s2,s3,x1,x2,x3; 1920 int jdx,idt,idx,nz,*vi,i; 1921 1922 PetscFunctionBegin; 1923 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1924 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1925 1926 1927 /* forward solve the lower triangular */ 1928 idx = 0; 1929 x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; 1930 for (i=1; i<n; i++) { 1931 v = aa + 9*ai[i]; 1932 vi = aj + ai[i]; 1933 nz = diag[i] - ai[i]; 1934 idx += 3; 1935 s1 = b[idx];s2 = b[1+idx];s3 = b[2+idx]; 1936 while (nz--) { 1937 jdx = 3*(*vi++); 1938 x1 = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx]; 1939 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1940 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1941 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1942 v += 9; 1943 } 1944 x[idx] = s1; 1945 x[1+idx] = s2; 1946 x[2+idx] = s3; 1947 } 1948 /* backward solve the upper triangular */ 1949 for (i=n-1; i>=0; i--){ 1950 v = aa + 9*diag[i] + 9; 1951 vi = aj + diag[i] + 1; 1952 nz = ai[i+1] - diag[i] - 1; 1953 idt = 3*i; 1954 s1 = x[idt]; s2 = x[1+idt]; 1955 s3 = x[2+idt]; 1956 while (nz--) { 1957 idx = 3*(*vi++); 1958 x1 = x[idx]; x2 = x[1+idx];x3 = x[2+idx]; 1959 s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3; 1960 s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3; 1961 s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3; 1962 v += 9; 1963 } 1964 v = aa + 9*diag[i]; 1965 x[idt] = v[0]*s1 + v[3]*s2 + v[6]*s3; 1966 x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3; 1967 x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3; 1968 } 1969 1970 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 1971 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 1972 PLogFlops(2*9*(a->nz) - 3*a->n); 1973 PetscFunctionReturn(0); 1974 } 1975 1976 #undef __FUNC__ 1977 #define __FUNC__ "MatSolve_SeqSBAIJ_2" 1978 int MatSolve_SeqSBAIJ_2(Mat A,Vec bb,Vec xx) 1979 { 1980 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 1981 IS iscol=a->col,isrow=a->row; 1982 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout; 1983 int *diag = a->diag; 1984 MatScalar *aa=a->a,*v; 1985 Scalar *x,*b,s1,s2,x1,x2,*t; 1986 1987 PetscFunctionBegin; 1988 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 1989 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 1990 t = a->solve_work; 1991 1992 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 1993 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 1994 1995 /* forward solve the lower triangular */ 1996 idx = 2*(*r++); 1997 t[0] = b[idx]; t[1] = b[1+idx]; 1998 for (i=1; i<n; i++) { 1999 v = aa + 4*ai[i]; 2000 vi = aj + ai[i]; 2001 nz = diag[i] - ai[i]; 2002 idx = 2*(*r++); 2003 s1 = b[idx]; s2 = b[1+idx]; 2004 while (nz--) { 2005 idx = 2*(*vi++); 2006 x1 = t[idx]; x2 = t[1+idx]; 2007 s1 -= v[0]*x1 + v[2]*x2; 2008 s2 -= v[1]*x1 + v[3]*x2; 2009 v += 4; 2010 } 2011 idx = 2*i; 2012 t[idx] = s1; t[1+idx] = s2; 2013 } 2014 /* backward solve the upper triangular */ 2015 for (i=n-1; i>=0; i--){ 2016 v = aa + 4*diag[i] + 4; 2017 vi = aj + diag[i] + 1; 2018 nz = ai[i+1] - diag[i] - 1; 2019 idt = 2*i; 2020 s1 = t[idt]; s2 = t[1+idt]; 2021 while (nz--) { 2022 idx = 2*(*vi++); 2023 x1 = t[idx]; x2 = t[1+idx]; 2024 s1 -= v[0]*x1 + v[2]*x2; 2025 s2 -= v[1]*x1 + v[3]*x2; 2026 v += 4; 2027 } 2028 idc = 2*(*c--); 2029 v = aa + 4*diag[i]; 2030 x[idc] = t[idt] = v[0]*s1 + v[2]*s2; 2031 x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2; 2032 } 2033 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2034 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2035 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2036 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2037 PLogFlops(2*4*(a->nz) - 2*a->n); 2038 PetscFunctionReturn(0); 2039 } 2040 2041 /* 2042 Special case where the matrix was ILU(0) factored in the natural 2043 ordering. This eliminates the need for the column and row permutation. 2044 */ 2045 #undef __FUNC__ 2046 #define __FUNC__ "MatSolve_SeqSBAIJ_2_NaturalOrdering" 2047 int MatSolve_SeqSBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx) 2048 { 2049 Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; 2050 int n=a->mbs,*ai=a->i,*aj=a->j; 2051 int ierr,*diag = a->diag; 2052 MatScalar *aa=a->a,*v; 2053 Scalar *x,*b,s1,s2,x1,x2; 2054 int jdx,idt,idx,nz,*vi,i; 2055 2056 PetscFunctionBegin; 2057 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2058 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2059 2060 /* forward solve the lower triangular */ 2061 idx = 0; 2062 x[0] = b[0]; x[1] = b[1]; 2063 for (i=1; i<n; i++) { 2064 v = aa + 4*ai[i]; 2065 vi = aj + ai[i]; 2066 nz = diag[i] - ai[i]; 2067 idx += 2; 2068 s1 = b[idx];s2 = b[1+idx]; 2069 while (nz--) { 2070 jdx = 2*(*vi++); 2071 x1 = x[jdx];x2 = x[1+jdx]; 2072 s1 -= v[0]*x1 + v[2]*x2; 2073 s2 -= v[1]*x1 + v[3]*x2; 2074 v += 4; 2075 } 2076 x[idx] = s1; 2077 x[1+idx] = s2; 2078 } 2079 /* backward solve the upper triangular */ 2080 for (i=n-1; i>=0; i--){ 2081 v = aa + 4*diag[i] + 4; 2082 vi = aj + diag[i] + 1; 2083 nz = ai[i+1] - diag[i] - 1; 2084 idt = 2*i; 2085 s1 = x[idt]; s2 = x[1+idt]; 2086 while (nz--) { 2087 idx = 2*(*vi++); 2088 x1 = x[idx]; x2 = x[1+idx]; 2089 s1 -= v[0]*x1 + v[2]*x2; 2090 s2 -= v[1]*x1 + v[3]*x2; 2091 v += 4; 2092 } 2093 v = aa + 4*diag[i]; 2094 x[idt] = v[0]*s1 + v[2]*s2; 2095 x[1+idt] = v[1]*s1 + v[3]*s2; 2096 } 2097 2098 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2099 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2100 PLogFlops(2*4*(a->nz) - 2*a->n); 2101 PetscFunctionReturn(0); 2102 } 2103 2104 /*----------- New 1 --------------*/ 2105 #undef __FUNC__ 2106 #define __FUNC__ "MatSolve_SeqSBAIJ_1" 2107 int MatSolve_SeqSBAIJ_1(Mat A,Vec bb,Vec xx) 2108 { 2109 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2110 IS isrow=a->row; 2111 int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr,*rip; 2112 MatScalar *aa=a->a,*v; 2113 Scalar *x,*b,xk,*xtmp; 2114 int nz,*vj,k; 2115 2116 PetscFunctionBegin; 2117 if (!mbs) PetscFunctionReturn(0); 2118 2119 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2120 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2121 xtmp = a->solve_work; 2122 2123 ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr); 2124 2125 /* solve U'*D*y = b by forward substitution */ 2126 for (k=0; k<mbs; k++) xtmp[k] = b[rip[k]]; 2127 for (k=0; k<mbs; k++){ 2128 v = aa + ai[k]; 2129 vj = aj + ai[k]; 2130 xk = xtmp[k]; 2131 nz = ai[k+1] - ai[k]; 2132 while (nz--) xtmp[*vj++] += (*v++) * xk; 2133 xtmp[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */ 2134 } 2135 2136 /* solve U*x = y by back substitution */ 2137 2138 for (k=mbs-1; k>=0; k--){ 2139 v = aa + ai[k]; 2140 vj = aj + ai[k]; 2141 xk = xtmp[k]; 2142 nz = ai[k+1] - ai[k]; 2143 while (nz--) xk += (*v++) * xtmp[*vj++]; 2144 xtmp[k] = xk; 2145 x[rip[k]] = xk; 2146 } 2147 2148 ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr); 2149 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2150 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2151 PLogFlops(4*a->s_nz + a->m); 2152 PetscFunctionReturn(0); 2153 } 2154 2155 2156 #ifdef CONT 2157 Mat_SeqBAIJ *a=(Mat_SeqBAIJ *)A->data; 2158 IS iscol=a->col,isrow=a->row; 2159 int *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout; 2160 int *diag = a->diag; 2161 MatScalar *aa=a->a,*v; 2162 Scalar *x,*b,s1,*t; 2163 2164 PetscFunctionBegin; 2165 if (!n) PetscFunctionReturn(0); 2166 2167 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2168 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2169 t = a->solve_work; 2170 2171 ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; 2172 ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); 2173 2174 /* forward solve the lower triangular */ 2175 t[0] = b[*r++]; 2176 for (i=1; i<n; i++) { 2177 v = aa + ai[i]; 2178 vi = aj + ai[i]; 2179 nz = diag[i] - ai[i]; 2180 s1 = b[*r++]; 2181 while (nz--) { 2182 s1 -= (*v++)*t[*vi++]; 2183 } 2184 t[i] = s1; 2185 } 2186 /* backward solve the upper triangular */ 2187 for (i=n-1; i>=0; i--){ 2188 v = aa + diag[i] + 1; 2189 vi = aj + diag[i] + 1; 2190 nz = ai[i+1] - diag[i] - 1; 2191 s1 = t[i]; 2192 while (nz--) { 2193 s1 -= (*v++)*t[*vi++]; 2194 } 2195 x[*c--] = t[i] = aa[diag[i]]*s1; 2196 } 2197 2198 ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr); 2199 ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr); 2200 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2201 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2202 PLogFlops(2*1*(a->nz) - a->n); 2203 PetscFunctionReturn(0); 2204 } 2205 2206 #endif 2207 /*----------- New 1 --------------*/ 2208 /* 2209 Special case where the matrix was ILU(0) factored in the natural 2210 ordering. This eliminates the need for the column and row permutation. 2211 */ 2212 #undef __FUNC__ 2213 #define __FUNC__ "MatSolve_SeqSBAIJ_1_NaturalOrdering" 2214 int MatSolve_SeqSBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx) 2215 { 2216 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ *)A->data; 2217 int mbs=a->mbs,*ai=a->i,*aj=a->j,ierr; 2218 MatScalar *aa=a->a,*v; 2219 Scalar *x,*b,xk; 2220 int nz,*vj,k; 2221 2222 PetscFunctionBegin; 2223 ierr = VecGetArray(bb,&b);CHKERRQ(ierr); 2224 ierr = VecGetArray(xx,&x);CHKERRQ(ierr); 2225 2226 /* solve U'*D*y = b by forward substitution */ 2227 for (k=0; k<mbs; k++) x[k] = b[k]; 2228 for (k=0; k<mbs; k++){ 2229 v = aa + ai[k]; 2230 vj = aj + ai[k]; 2231 xk = x[k]; 2232 nz = ai[k+1] - ai[k]; 2233 while (nz--) x[*vj++] += (*v++) * xk; 2234 x[k] = xk*aa[k]; /* note: aa[k] = 1/D(k) */ 2235 } 2236 2237 /* solve U*x = y by back substitution */ 2238 for (k=mbs-2; k>=0; k--){ 2239 v = aa + ai[k]; 2240 vj = aj + ai[k]; 2241 xk = x[k]; 2242 nz = ai[k+1] - ai[k]; 2243 while (nz--) xk += (*v++) * x[*vj++]; 2244 x[k] = xk; 2245 } 2246 2247 ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr); 2248 ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr); 2249 PLogFlops(4*a->s_nz + a->m); 2250 PetscFunctionReturn(0); 2251 } 2252 2253 /* ----------------------------------------------------------------*/ 2254 #undef __FUNC__ 2255 #define __FUNC__ "MatILUFactorSymbolic_SeqSBAIJ" 2256 int MatILUFactorSymbolic_SeqSBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *B) 2257 { 2258 Mat_SeqSBAIJ *a = (Mat_SeqSBAIJ*)A->data,*b; 2259 IS isicol; 2260 int *rip,*riip,ierr,i,mbs = a->mbs,*ai = a->i,*aj = a->j; 2261 int *jutmp,bs = a->bs,bs2=a->bs2; 2262 int m,nzi,realloc = 0,*levtmp; 2263 int *jl,*q,jumin,jmin,jmax,juptr,nzk,qm,*iu,*ju,k,j,vj,umax,maxadd; 2264 int levels,incrlev,*lev,lev_ik,shift; 2265 PetscReal f; 2266 2267 PetscFunctionBegin; 2268 if (info) { 2269 f = info->fill; 2270 levels = (int)info->levels; 2271 } else { 2272 f = 1.0; 2273 levels = 0; 2274 } 2275 PetscValidHeaderSpecific(isrow,IS_COOKIE); 2276 PetscValidHeaderSpecific(iscol,IS_COOKIE); 2277 /* if (A->M != A->N) SETERRQ(PETSC_ERR_ARG_WRONG,0,"matrix must be square");*/ 2278 ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr); 2279 ierr = ISGetIndices(isrow,&rip);CHKERRQ(ierr); 2280 ierr = ISGetIndices(isicol,&riip);CHKERRQ(ierr); 2281 for (k=0; k<mbs; k++) { 2282 if ( rip[k] - riip[k] != 0 ) { 2283 printf("Non-symm. permutation, use symm. permutation or general matrix format\n"); 2284 break; 2285 } 2286 } 2287 2288 /* initialization */ 2289 /* Don't know how many column pointers are needed so estimate. 2290 Use Modified Sparse Row storage for u and ju, see Sasd pp.85 */ 2291 2292 umax = (int)(f*ai[mbs] + 1); 2293 lev = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(lev); 2294 umax += mbs + 1; 2295 shift = mbs + 1; 2296 ju = iu = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(ju); 2297 iu[0] = mbs+1; 2298 juptr = mbs; 2299 jl = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(jl); 2300 q = (int*)PetscMalloc(mbs*sizeof(int));CHKPTRQ(q); 2301 levtmp = (int*)PetscMalloc((mbs+1)*sizeof(int));CHKPTRQ(levtmp); 2302 for (i=0; i<mbs; i++){ 2303 jl[i] = mbs; q[i] = 0; 2304 } 2305 2306 /* for each row k */ 2307 for (k=0; k<mbs; k++){ 2308 nzk = 0; 2309 q[k] = mbs; 2310 /* initialize nonzero structure of k-th row to row rip[k] of A */ 2311 jmin = ai[rip[k]]; 2312 jmax = ai[rip[k]+1]; 2313 for (j=jmin; j<jmax; j++){ 2314 vj = riip[aj[j]]; 2315 if(vj > k){ 2316 qm = k; 2317 do { 2318 m = qm; qm = q[m]; 2319 } while(qm < vj); 2320 if (qm == vj) { 2321 printf(" error: duplicate entry in A\n"); break; 2322 } 2323 nzk++; 2324 q[m] = vj; 2325 q[vj] = qm; 2326 levtmp[vj] = 0; /* initialize lev for nonzero element */ 2327 } /* if(vj > k) */ 2328 } /* for (j=jmin; j<jmax; j++) */ 2329 2330 /* modify nonzero structure of k-th row by computing fill-in 2331 for each row i to be merged in */ 2332 i = k; 2333 i = jl[i]; /* next pivot row (== 0 for symbolic factorization) */ 2334 /* printf(" next pivot row i=%d\n",i); */ 2335 while (i < mbs){ 2336 /* merge row i into k-th row */ 2337 j=iu[i]; 2338 vj = ju[j]; 2339 lev_ik = lev[j-shift]; 2340 nzi = iu[i+1] - (iu[i]+1); 2341 jmin = iu[i] + 1; jmax = iu[i] + nzi; 2342 qm = k; 2343 for (j=jmin; j<jmax+1; j++){ 2344 vj = ju[j]; 2345 incrlev = lev[j-shift]+lev_ik+1; 2346 2347 if (incrlev > levels) continue; 2348 do { 2349 m = qm; qm = q[m]; 2350 } while (qm < vj); 2351 if (qm != vj){ /* a fill */ 2352 nzk++; q[m] = vj; q[vj] = qm; qm = vj; 2353 levtmp[vj] = incrlev; 2354 } 2355 else { /* already a nonzero element */ 2356 if (levtmp[vj]>incrlev) levtmp[vj] = incrlev; 2357 } 2358 } 2359 i = jl[i]; /* next pivot row */ 2360 } 2361 2362 /* add k to row list for first nonzero element in k-th row */ 2363 if (nzk > 1){ 2364 i = q[k]; /* col value of first nonzero element in k_th row of U */ 2365 jl[k] = jl[i]; jl[i] = k; 2366 } 2367 iu[k+1] = iu[k] + nzk; /* printf(" iu[%d]=%d, umax=%d\n", k+1, iu[k+1],umax);*/ 2368 2369 /* allocate more space to ju and lev if needed */ 2370 if (iu[k+1] > umax) { printf("allocate more space, iu[%d]=%d > umax=%d\n",k+1, iu[k+1],umax); 2371 /* estimate how much additional space we will need */ 2372 /* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */ 2373 /* just double the memory each time */ 2374 maxadd = umax; 2375 if (maxadd < nzk) maxadd = (mbs-k)*(nzk+1)/2; 2376 umax += maxadd; 2377 2378 /* allocate a longer ju (NOTE: iu poits to the beginning of ju) */ 2379 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 2380 ierr = PetscMemcpy(jutmp,ju,iu[k]*sizeof(int));CHKERRQ(ierr); 2381 ierr = PetscFree(ju);CHKERRQ(ierr); 2382 ju = iu = jutmp; 2383 2384 jutmp = (int*)PetscMalloc(umax*sizeof(int));CHKPTRQ(jutmp); 2385 ierr = PetscMemcpy(jutmp,lev,(iu[k]-shift)*sizeof(int));CHKERRQ(ierr); 2386 ierr = PetscFree(lev);CHKERRQ(ierr); 2387 lev = jutmp; 2388 2389 realloc += 2; /* count how many times we realloc */ 2390 } 2391 2392 /* save nonzero structure of k-th row in ju */ 2393 i=k; 2394 jumin = juptr + 1; juptr += nzk; 2395 for (j=jumin; j<juptr+1; j++){ 2396 i = q[i]; 2397 ju[j] = i; 2398 lev[j-shift] = levtmp[i]; 2399 /* printf(" k=%d, ju[%d]=%d\n",k,j,ju[j]);*/ 2400 } 2401 /* printf("\n"); */ 2402 } /* for (k=0; k<mbs; k++) */ 2403 2404 if (ai[mbs] != 0) { 2405 PetscReal af = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2406 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af); 2407 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Run with -pc_lu_fill %g or use \n",af); 2408 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:PCLUSetFill(pc,%g);\n",af); 2409 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:for best performance.\n"); 2410 } else { 2411 PLogInfo(A,"MatLUFactorSymbolic_SeqSBAIJ:Empty matrix.\n"); 2412 } 2413 2414 ierr = ISRestoreIndices(isrow,&rip);CHKERRQ(ierr); 2415 ierr = ISRestoreIndices(isicol,&riip);CHKERRQ(ierr); 2416 2417 ierr = PetscFree(q);CHKERRQ(ierr); 2418 ierr = PetscFree(jl);CHKERRQ(ierr); 2419 2420 /* put together the new matrix */ 2421 ierr = MatCreateSeqSBAIJ(A->comm,bs,bs*mbs,bs*mbs,0,PETSC_NULL,B);CHKERRQ(ierr); 2422 PLogObjectParent(*B,isicol); 2423 b = (Mat_SeqSBAIJ*)(*B)->data; 2424 ierr = PetscFree(b->imax);CHKERRQ(ierr); 2425 b->singlemalloc = PETSC_FALSE; 2426 /* the next line frees the default space generated by the Create() */ 2427 ierr = PetscFree(b->a);CHKERRQ(ierr); 2428 ierr = PetscFree(b->ilen);CHKERRQ(ierr); 2429 b->a = (MatScalar*)PetscMalloc((iu[mbs]+1)*sizeof(MatScalar)*bs2);CHKPTRQ(b->a); 2430 b->j = ju; 2431 b->i = iu; 2432 b->diag = 0; 2433 b->ilen = 0; 2434 b->imax = 0; 2435 b->row = isrow; 2436 b->col = iscol; 2437 ierr = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr); 2438 ierr = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr); 2439 b->icol = isicol; 2440 b->solve_work = (Scalar*)PetscMalloc((bs*mbs+bs)*sizeof(Scalar));CHKPTRQ(b->solve_work); 2441 /* In b structure: Free imax, ilen, old a, old j. 2442 Allocate idnew, solve_work, new a, new j */ 2443 PLogObjectMemory(*B,(iu[mbs]-mbs)*(sizeof(int)+sizeof(MatScalar))); 2444 b->s_maxnz = b->s_nz = iu[mbs]; 2445 2446 (*B)->factor = FACTOR_LU; 2447 (*B)->info.factor_mallocs = realloc; 2448 (*B)->info.fill_ratio_given = f; 2449 if (ai[mbs] != 0) { 2450 (*B)->info.fill_ratio_needed = ((PetscReal)iu[mbs])/((PetscReal)ai[mbs]); 2451 } else { 2452 (*B)->info.fill_ratio_needed = 0.0; 2453 } 2454 2455 PetscFunctionReturn(0); 2456 } 2457 2458 2459 2460